From 538cd0c685dbe6fcbb85bb7bbf27ac854d16df80 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 12:49:02 +0100 Subject: [PATCH 01/20] Add summaries, also some f-strings and black --- pairtools/pairtools_stats.py | 446 ++++++++++++++++++++--------------- 1 file changed, 262 insertions(+), 184 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 0b56a380..2064d71a 100644 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- import io @@ -11,63 +10,62 @@ from . import _fileio, _pairsam_format, cli, _headerops, common_io_options -UTIL_NAME = 'pairtools_stats' - -@cli.command() - -@click.argument( - 'input_path', - type=str, - nargs=-1, - required=False) +UTIL_NAME = "pairtools_stats" -@click.option( - '-o', "--output", - type=str, - default="", - help='output stats tsv file.') +@cli.command() +@click.argument("input_path", type=str, nargs=-1, required=False) +@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") @click.option( - "--merge", + "--merge", is_flag=True, - help='If specified, merge multiple input stats files instead of calculating' - ' statistics of a .pairs/.pairsam file. Merging is performed via summation of' - ' all overlapping statistics. Non-overlapping statistics are appended to' - ' the end of the file.', - ) - + help="If specified, merge multiple input stats files instead of calculating" + " statistics of a .pairs/.pairsam file. Merging is performed via summation of" + " all overlapping statistics. Non-overlapping statistics are appended to" + " the end of the file.", +) @common_io_options - def stats(input_path, output, merge, **kwargs): - '''Calculate pairs statistics. + """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. If not provided, the input is read from stdin. - If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number + If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number of stats files to merge. - - The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. - ''' + + The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. + """ stats_py(input_path, output, merge, **kwargs) + def stats_py(input_path, output, merge, **kwargs): if merge: do_merge(output, input_path, **kwargs) return - instream = (_fileio.auto_open(input_path[0], mode='r', - nproc=kwargs.get('nproc_in'), - command=kwargs.get('cmd_in', None)) - if input_path else sys.stdin) - outstream = (_fileio.auto_open(output, mode='w', - nproc=kwargs.get('nproc_out'), - command=kwargs.get('cmd_out', None)) - if output else sys.stdout) - + instream = ( + _fileio.auto_open( + input_path[0], + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if input_path + else sys.stdin + ) + outstream = ( + _fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) header, body_stream = _headerops.get_header(instream) - # new stats class stuff would come here ... 
stats = PairCounter() @@ -86,28 +84,28 @@ def stats_py(input_path, output, merge, **kwargs): # pair type: pair_type = cols[_pairsam_format.COL_PTYPE] - stats.add_pair(chrom1, pos1, strand1, - chrom2, pos2, strand2, - pair_type) - + stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type) + stats.calculate_summaries() # save statistics to file ... stats.save(outstream) - - if instream != sys.stdin: instream.close() if outstream != sys.stdout: outstream.close() + def do_merge(output, files_to_merge, **kwargs): # Parse all stats files. stats = [] for stat_file in files_to_merge: - f = _fileio.auto_open(stat_file, mode='r', - nproc=kwargs.get('nproc_in'), - command=kwargs.get('cmd_in', None)) + f = _fileio.auto_open( + stat_file, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) # use a factory method to instanciate PairCounter stat = PairCounter.from_file(f) stats.append(stat) @@ -117,17 +115,27 @@ def do_merge(output, files_to_merge, **kwargs): out_stat = sum(stats) # Save merged stats. - outstream = _fileio.auto_open(output, mode='w', - nproc=kwargs.get('nproc_out'), - command=kwargs.get('cmd_out', None)) + outstream = _fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) # save statistics to file ... out_stat.save(outstream) - + if outstream != sys.stdout: outstream.close() +def complexity_estimate(total_mapped, total_dups): + try: + return total_mapped ** 2 / total_dups / 2 + except ZeroDivisionError: + return 0 + + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -141,58 +149,76 @@ class PairCounter(Mapping): -- multiple PairCounters can be merged via addition. """ - _SEP = '\t' - _KEY_SEP = '/' + _SEP = "\t" + _KEY_SEP = "/" def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): self._stat = OrderedDict() # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution - self._dist_bins = (np.r_[0, - np.round(10**np.arange(min_log10_dist, max_log10_dist+0.001, - log10_dist_bin_step)) - .astype(np.int)] - ) + self._dist_bins = np.r_[ + 0, + np.round( + 10 + ** np.arange( + min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step + ) + ).astype(np.int), + ] # establish structure of an empty _stat: - self._stat['total'] = 0 - self._stat['total_unmapped'] = 0 - self._stat['total_single_sided_mapped'] = 0 + self._stat["total"] = 0 + self._stat["total_unmapped"] = 0 + self._stat["total_single_sided_mapped"] = 0 # total_mapped = total_dups + total_nodups - self._stat['total_mapped'] = 0 - self._stat['total_dups'] = 0 - self._stat['total_nodups'] = 0 + self._stat["total_mapped"] = 0 + self._stat["total_dups"] = 0 + self._stat["total_nodups"] = 0 ######################################## # the rest of stats are based on nodups: ######################################## - self._stat['cis'] = 0 - self._stat['trans'] = 0 - self._stat['pair_types'] = {} + self._stat["cis"] = 0 + self._stat["trans"] = 0 + self._stat["pair_types"] = {} # to be removed: - self._stat['dedup'] = {} - - self._stat['cis_1kb+'] = 0 - self._stat['cis_2kb+'] = 0 - self._stat['cis_4kb+'] = 0 - self._stat['cis_10kb+'] = 0 - self._stat['cis_20kb+'] = 0 - self._stat['cis_40kb+'] = 0 - - self._stat['chrom_freq'] = OrderedDict() - - self._stat['dist_freq'] = OrderedDict([ - ('+-', np.zeros(len(self._dist_bins), dtype=np.int)), - ('-+', np.zeros(len(self._dist_bins), dtype=np.int)), - ('--', 
np.zeros(len(self._dist_bins), dtype=np.int)), - ('++', np.zeros(len(self._dist_bins), dtype=np.int)), - ]) - + self._stat["dedup"] = {} + + self._stat["cis_1kb+"] = 0 + self._stat["cis_2kb+"] = 0 + self._stat["cis_4kb+"] = 0 + self._stat["cis_10kb+"] = 0 + self._stat["cis_20kb+"] = 0 + self._stat["cis_40kb+"] = 0 + + self._stat["chrom_freq"] = OrderedDict() + + self._stat["dist_freq"] = OrderedDict( + [ + ("+-", np.zeros(len(self._dist_bins), dtype=np.int)), + ("-+", np.zeros(len(self._dist_bins), dtype=np.int)), + ("--", np.zeros(len(self._dist_bins), dtype=np.int)), + ("++", np.zeros(len(self._dist_bins), dtype=np.int)), + ] + ) + # Summaries are derived from other stats and are recalculated on merge + self._stat["summary"] = OrderedDict( + [ + ("frac_cis", 0), + ("frac_cis_1kb+", 0), + ("frac_cis_2kb+", 0), + ("frac_cis_4kb+", 0), + ("frac_cis_10kb+", 0), + ("frac_cis_20kb+", 0), + ("frac_cis_40kb+", 0), + ("complexity", 0), + ] + ) def __getitem__(self, key): if isinstance(key, str): # let's strip any unintentional '/' # from either side of the key - key = key.strip('/') + key = key.strip("/") if self._KEY_SEP in key: # multi-key to access nested elements k_fields = key.split(self._KEY_SEP) @@ -202,32 +228,33 @@ def __getitem__(self, key): return self._stat[key] else: # clearly an error: - raise ValueError( - '{} is not a valid key: must be str'.format(key)) + raise ValueError(f"{key} is not a valid key: must be str") # K_FIELDS: # process multi-key case: # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] # get the first 'k' and keep the remainders in 'k_fields' k = k_fields.pop(0) - if k in ['pair_types', 'dedup']: + if k in ["pair_types", "dedup"]: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(k_fields) == 1: return self._stat[k][k_fields[0]] else: raise ValueError( - '{} is not a valid key: {} section implies 1 identifier'.format(key, k)) + f"{key} is not a valid key: {k} section implies 1 identifier" + ) - elif k == 'chrom_freq': + elif k == "chrom_freq": # assert remaining key_fields == [chr1, chr2]: if len(k_fields) == 2: return self._stat[k][tuple(k_fields)] else: raise ValueError( - '{} is not a valid key: {} section implies 2 identifiers'.format(key, k)) + f"{key} is not a valid key: {k} section implies 2 identifiers" + ) - elif k == 'dist_freq': + elif k == "dist_freq": # assert that last element of key_fields is the 'directions' # THIS IS DONE FOR CONSISTENCY WITH .stats FILE # SHOULD THAT BE CHANGED IN .stats AND HERE AS WELL? 
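As a point of reference, here is a minimal, self-contained sketch of how the flat dist_freq keys map onto the log-spaced distance bins built in __init__ above; the same searchsorted lookup is what __getitem__ (continued in the next hunk) and from_file perform. The key below is illustrative, and plain int replaces np.int, which recent NumPy releases no longer accept.

import numpy as np

# Bin edges: 0, then 10**x rounded to an integer, for x running from
# min_log10_dist to max_log10_dist in steps of log10_dist_bin_step
# (defaults 0, 9 and 0.25, as in PairCounter.__init__).
min_log10_dist, max_log10_dist, step = 0, 9, 0.25
dist_bins = np.r_[
    0,
    np.round(
        10 ** np.arange(min_log10_dist, max_log10_dist + 0.001, step)
    ).astype(int),
]

# A key such as "dist_freq/1778-3162/++" names a bin by its left edge;
# searchsorted(..., "right") - 1 recovers the bin index.
key = "dist_freq/1778-3162/++"
_, bin_range, dirs = key.split("/")
left = int(bin_range.rstrip("+").split("-")[0])
bin_idx = np.searchsorted(dist_bins, left, "right") - 1
print(dirs, bin_idx, dist_bins[bin_idx], dist_bins[bin_idx + 1])  # ++ 14 1778 3162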
@@ -235,29 +262,53 @@ def __getitem__(self, key): # assert 'dirs' in ['++','--','+-','-+'] dirs = k_fields.pop() # there is only genomic distance range of the bin that's left: - bin_range, = k_fields + (bin_range,) = k_fields # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = bin_range.strip('+') if bin_range.endswith('+') \ - else bin_range.split('-')[0] + dist_bin_left = ( + bin_range.strip("+") + if bin_range.endswith("+") + else bin_range.split("-")[0] + ) # get the index of that bin: - bin_idx = np.searchsorted(self._dist_bins, int(dist_bin_left), 'right') - 1 + bin_idx = ( + np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1 + ) # store corresponding value: - return self._stat['dist_freq'][dirs][bin_idx] + return self._stat["dist_freq"][dirs][bin_idx] else: raise ValueError( - '{} is not a valid key: {} section implies 2 identifiers'.format(key,k)) + f"{key} is not a valid key: {k} section implies 2 identifiers" + ) else: - raise ValueError( - '{} is not a valid key'.format(k)) - + raise ValueError(f"{k} is not a valid key") def __iter__(self): return iter(self._stat) - def __len__(self): return len(self._stat) + def calculate_summaries(self): + """calculate summary statistics (fraction of cis pairs at different cutoffs, + complexity estimate) based on accumulated counts. Results are saved into + self._stat['summary'] + + """ + for cis_count in ( + "cis", + "cis_1kb+", + "cis_2kb+", + "cis_4kb+", + "cis_10kb+", + "cis_20kb+", + "cis_40kb+", + ): + self._stat["summary"][f"frac_{cis_count}"] = ( + self._stat[cis_count] / self._stat["total_nodups"] + ) + self._stat["summary"]["complexity"] = complexity_estimate( + self._stat["total_mapped"], self._stat["total_dups"] + ) @classmethod def from_file(cls, file_handle): @@ -275,73 +326,93 @@ def from_file(cls, file_handle): # fill in from file - file_handle: stat_from_file = cls() for l in file_handle: - fields = l.strip().split(cls._SEP) + fields = l.strip().split(cls._SEP) if len(fields) == 0: # skip empty lines: continue if len(fields) != 2: # expect two _SEP separated values per line: raise _fileio.ParseError( - '{} is not a valid stats file'.format(file_handle.name)) + "{} is not a valid stats file".format(file_handle.name) + ) # extract key and value, then split the key: - putative_key, putative_val = fields[0], fields[1] + putative_key, putative_val = fields[0], fields[1] key_fields = putative_key.split(cls._KEY_SEP) # we should impose a rigid structure of .stats or redo it: - if len(key_fields)==1: + if len(key_fields) == 1: key = key_fields[0] if key in stat_from_file._stat: stat_from_file._stat[key] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: unknown field {} detected'.format(file_handle.name,key)) + "{} is not a valid stats file: unknown field {} detected".format( + file_handle.name, key + ) + ) else: # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] # get the first 'key' and keep the remainders in 'key_fields' key = key_fields.pop(0) - if key in ['pair_types', 'dedup']: + if key in ["pair_types", "dedup"]: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(key_fields) == 1: stat_from_file._stat[key][key_fields[0]] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 1 identifier'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 1 identifier".format( + file_handle.name, key + ) + ) - elif 
key == 'chrom_freq': + elif key == "chrom_freq": # assert remaining key_fields == [chr1, chr2]: if len(key_fields) == 2: stat_from_file._stat[key][tuple(key_fields)] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 2 identifiers'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 2 identifiers".format( + file_handle.name, key + ) + ) - elif key == 'dist_freq': + elif key == "dist_freq": # assert that last element of key_fields is the 'directions' if len(key_fields) == 2: # assert 'dirs' in ['++','--','+-','-+'] dirs = key_fields.pop() # there is only genomic distance range of the bin that's left: - bin_range, = key_fields + (bin_range,) = key_fields # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = (bin_range.strip('+') - if bin_range.endswith('+') - else bin_range.split('-')[0]) + dist_bin_left = ( + bin_range.strip("+") + if bin_range.endswith("+") + else bin_range.split("-")[0] + ) # get the index of that bin: - bin_idx = np.searchsorted( - stat_from_file._dist_bins, - int(dist_bin_left), 'right') - 1 + bin_idx = ( + np.searchsorted( + stat_from_file._dist_bins, int(dist_bin_left), "right" + ) + - 1 + ) # store corresponding value: stat_from_file._stat[key][dirs][bin_idx] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 2 identifiers'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 2 identifiers".format( + file_handle.name, key + ) + ) else: raise _fileio.ParseError( - '{} is not a valid stats file: unknown field {} detected'.format(file_handle.name,key)) + "{} is not a valid stats file: unknown field {} detected".format( + file_handle.name, key + ) + ) # return PairCounter from a non-empty dict: return stat_from_file - def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): """Gather statistics for a Hi-C pair and add to the PairCounter. @@ -363,44 +434,46 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): type of the mapped pair of reads """ - self._stat['total'] += 1 + self._stat["total"] += 1 # collect pair type stats including DD: - self._stat['pair_types'][pair_type] = self._stat['pair_types'].get(pair_type,0) + 1 - if chrom1 == '!' and chrom2 == '!': - self._stat['total_unmapped'] += 1 - elif chrom1 != '!' and chrom2 != '!': - self._stat['total_mapped'] += 1 + self._stat["pair_types"][pair_type] = ( + self._stat["pair_types"].get(pair_type, 0) + 1 + ) + if chrom1 == "!" and chrom2 == "!": + self._stat["total_unmapped"] += 1 + elif chrom1 != "!" 
and chrom2 != "!": + self._stat["total_mapped"] += 1 # only mapped ones can be duplicates: - if pair_type == 'DD': - self._stat['total_dups'] += 1 + if pair_type == "DD": + self._stat["total_dups"] += 1 else: - self._stat['total_nodups'] += 1 - self._stat['chrom_freq'][(chrom1, chrom2)] = ( - self._stat['chrom_freq'].get((chrom1, chrom2), 0) + 1) + self._stat["total_nodups"] += 1 + self._stat["chrom_freq"][(chrom1, chrom2)] = ( + self._stat["chrom_freq"].get((chrom1, chrom2), 0) + 1 + ) if chrom1 == chrom2: - self._stat['cis'] += 1 - dist = np.abs(pos2-pos1) - bin_idx = np.searchsorted(self._dist_bins, dist, 'right') - 1 - self._stat['dist_freq'][strand1+strand2][bin_idx] += 1 + self._stat["cis"] += 1 + dist = np.abs(pos2 - pos1) + bin_idx = np.searchsorted(self._dist_bins, dist, "right") - 1 + self._stat["dist_freq"][strand1 + strand2][bin_idx] += 1 if dist >= 1000: - self._stat['cis_1kb+'] += 1 + self._stat["cis_1kb+"] += 1 if dist >= 2000: - self._stat['cis_2kb+'] += 1 + self._stat["cis_2kb+"] += 1 if dist >= 4000: - self._stat['cis_4kb+'] += 1 + self._stat["cis_4kb+"] += 1 if dist >= 10000: - self._stat['cis_10kb+'] += 1 + self._stat["cis_10kb+"] += 1 if dist >= 20000: - self._stat['cis_20kb+'] += 1 + self._stat["cis_20kb+"] += 1 if dist >= 40000: - self._stat['cis_40kb+'] += 1 + self._stat["cis_40kb+"] += 1 else: - self._stat['trans'] += 1 + self._stat["trans"] += 1 else: - self._stat['total_single_sided_mapped'] += 1 - + self._stat["total_single_sided_mapped"] += 1 def __add__(self, other): # both PairCounter are implied to have a list of common fields: @@ -412,39 +485,41 @@ def __add__(self, other): # initialize empty PairCounter for the result of summation: sum_stat = PairCounter() # use the empty PairCounter to iterate over: - for k,v in sum_stat._stat.items(): + for k, v in sum_stat._stat.items(): # not nested fields are summed trivially: if isinstance(v, int): sum_stat._stat[k] = self._stat[k] + other._stat[k] # sum nested dicts/arrays in a context dependet manner: else: - if k in ['pair_types','dedup']: + if k in ["pair_types", "dedup"]: # handy function for summation of a pair of dicts: # https://stackoverflow.com/questions/10461531/merge-and-sum-of-two-dictionaries - sum_dicts = lambda dict_x,dict_y: { - key: dict_x.get(key, 0) + dict_y.get(key, 0) - for key in set(dict_x) | set(dict_y) } + sum_dicts = lambda dict_x, dict_y: { + key: dict_x.get(key, 0) + dict_y.get(key, 0) + for key in set(dict_x) | set(dict_y) + } # sum a pair of corresponding dicts: sum_stat._stat[k] = sum_dicts(self._stat[k], other._stat[k]) - if k == 'chrom_freq': + if k == "chrom_freq": # union list of keys (chr1,chr2) with potential duplicates: - union_keys_with_dups = list(self._stat[k].keys()) + list(other._stat[k].keys()) + union_keys_with_dups = list(self._stat[k].keys()) + list( + other._stat[k].keys() + ) # OrderedDict.fromkeys will take care of keys' order and duplicates in a consistent manner: # https://stackoverflow.com/questions/1720421/how-to-concatenate-two-lists-in-python # last comment to the 3rd Answer - sum_stat._stat[k] = OrderedDict.fromkeys( union_keys_with_dups ) + sum_stat._stat[k] = OrderedDict.fromkeys(union_keys_with_dups) # perform a summation: for union_key in sum_stat._stat[k]: - sum_stat._stat[k][union_key] = ( - self._stat[k].get(union_key, 0) - + other._stat[k].get(union_key, 0) - ) - if k == 'dist_freq': + sum_stat._stat[k][union_key] = self._stat[k].get( + union_key, 0 + ) + other._stat[k].get(union_key, 0) + if k == "dist_freq": for dirs in sum_stat[k]: sum_stat[k][dirs] 
= self._stat[k][dirs] + other._stat[k][dirs] + self.calculate_summaries() return sum_stat - # we need this to be able to sum(list_of_PairCounters) def __radd__(self, other): if other == 0: @@ -452,50 +527,54 @@ def __radd__(self, other): else: return self.__add__(other) - def flatten(self): - """return a flattened OrderedDic (formatted same way as .stats file) - - """ + """return a flattened OrderedDic (formatted same way as .stats file)""" # OrderedDict for flat store: flat_stat = OrderedDict() # Storing statistics - for k,v in self._stat.items(): + for k, v in self._stat.items(): if isinstance(v, int): flat_stat[k] = v # store nested dicts/arrays in a context dependet manner: # nested categories are stored only if they are non-trivial else: - if (k == 'dist_freq') and v: + if (k == "dist_freq") and v: for i in range(len(self._dist_bins)): for dirs, freqs in v.items(): # last bin is treated differently: "100000+" vs "1200-3000": if i != len(self._dist_bins) - 1: - formatted_key = self._KEY_SEP.join(['{}','{}-{}','{}']).format( - k, self._dist_bins[i], self._dist_bins[i+1], dirs) + formatted_key = self._KEY_SEP.join( + ["{}", "{}-{}", "{}"] + ).format( + k, self._dist_bins[i], self._dist_bins[i + 1], dirs + ) else: - formatted_key = self._KEY_SEP.join(['{}','{}+','{}']).format( - k, self._dist_bins[i], dirs) - #store key,value pair: + formatted_key = self._KEY_SEP.join( + ["{}", "{}+", "{}"] + ).format(k, self._dist_bins[i], dirs) + # store key,value pair: flat_stat[formatted_key] = freqs[i] - elif (k in ['pair_types','dedup']) and v: - # 'pair_types' and 'dedup' are simple dicts inside, + elif (k in ["pair_types", "dedup", "summary"]) and v: + # 'pair_types', 'dedup' and 'summary' are simple dicts inside, # treat them the exact same way: for k_item, freq in v.items(): - formatted_key = self._KEY_SEP.join(['{}','{}']).format(k, k_item) - #store key,value pair: + formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( + k, k_item + ) + # store key,value pair: flat_stat[formatted_key] = freq - elif (k == 'chrom_freq') and v: + elif (k == "chrom_freq") and v: for (chrom1, chrom2), freq in v.items(): - formatted_key = self._KEY_SEP.join(['{}','{}','{}']).format(k, chrom1, chrom2) - #store key,value pair: + formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( + k, chrom1, chrom2 + ) + # store key,value pair: flat_stat[formatted_key] = freq # return flattened OrderedDict return flat_stat - def save(self, outstream): """save PairCounter to tab-delimited text file. Flattened version of PairCounter is stored in the file. 
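For orientation, the flattened output produced by flatten() and written by save() below is a two-column, tab-separated text file with '/'-joined keys. An illustrative excerpt (values correspond to the small mock pairs file used by the tests later in this series):

total	9
total_mapped	6
total_dups	1
total_nodups	5
cis	3
trans	2
pair_types/UU	4
chrom_freq/chr1/chr1	3
dist_freq/1-2/++	1
summary/frac_cis	0.6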
@@ -516,10 +595,9 @@ def save(self, outstream): """ # write flattened version of the PairCounter to outstream - for k,v in self.flatten().items(): - outstream.write('{}{}{}\n'.format(k, self._SEP, v)) + for k, v in self.flatten().items(): + outstream.write("{}{}{}\n".format(k, self._SEP, v)) -if __name__ == '__main__': +if __name__ == "__main__": stats() - From 99678bb0b35f770e4b064a0344a27b0b7867e9a9 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 13:09:01 +0100 Subject: [PATCH 02/20] Fix stats test --- tests/test_stats.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index a9a10674..b6691369 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -27,7 +27,10 @@ def test_mock_pairsam(): if not l.startswith('#') and l.strip()) for k in stats: - stats[k] = int(stats[k]) + try: + stats[k] = int(stats[k]) + except ValueError: + stats[k] = float(stats[k]) print(stats) assert stats['total'] == 8 @@ -53,4 +56,11 @@ def test_mock_pairsam(): assert stats['dist_freq/1-2/++'] == 1 assert stats['dist_freq/2-3/++'] == 1 assert stats['dist_freq/32-56/++'] == 1 + assert stats['summary/frac_cis'] == 0.6 + assert stats['summary/frac_cis_1kb+'] == 0 + assert stats['summary/frac_cis_2kb+'] == 0 + assert stats['summary/frac_cis_4kb+'] == 0 + assert stats['summary/frac_cis_10kb+'] == 0 + assert stats['summary/frac_cis_20kb+'] == 0 + assert stats['summary/frac_cis_40kb+'] == 0 From 6e202cb4b2a137ee3aae676fd232d7f24fbfc1f2 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 13:16:58 +0100 Subject: [PATCH 03/20] Fix merge --- pairtools/pairtools_stats.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 2064d71a..fe0d130e 100644 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -350,14 +350,17 @@ def from_file(cls, file_handle): ) ) else: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] + # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup', 'summary'] # get the first 'key' and keep the remainders in 'key_fields' key = key_fields.pop(0) - if key in ["pair_types", "dedup"]: + if key in ["pair_types", "dedup", 'summary']: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(key_fields) == 1: - stat_from_file._stat[key][key_fields[0]] = int(fields[1]) + try: + stat_from_file._stat[key][key_fields[0]] = int(fields[1]) + except ValueError: + stat_from_file._stat[key][key_fields[0]] = float(fields[1]) else: raise _fileio.ParseError( "{} is not a valid stats file: {} section implies 1 identifier".format( @@ -517,7 +520,7 @@ def __add__(self, other): if k == "dist_freq": for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] - self.calculate_summaries() + sum_stat.calculate_summaries() return sum_stat # we need this to be able to sum(list_of_PairCounters) From 96c2e48da7a4cac29da9ecc68f4e7f2493525b17 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 22 Apr 2022 15:46:50 +0200 Subject: [PATCH 04/20] Add functions for duplication tile and complexity --- pairtools/pairtools_stats.py | 95 +++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 350aa878..7fdc2e7a 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -6,6 +6,7 @@ import numpy 
as np import pandas as pd +from scipy import special from collections.abc import Mapping @@ -131,11 +132,87 @@ def do_merge(output, files_to_merge, **kwargs): outstream.close() -def complexity_estimate(total_mapped, total_dups): - try: - return total_mapped ** 2 / total_dups / 2 - except ZeroDivisionError: - return 0 +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = nseq / seq_to_complexity + return complexity + + +def extract_tile_info(series, regex=False): + """Extract the name of the tile for each read name in the series + + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. + By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + return split[2] + ":" + split[3] + ":" + split[4] + + +def analyse_duplicate_stats(dups, tile_dup_regex=False): + """_summary_ + + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + dups = dups.copy() + dups["tile"] = extract_tile_info(dups["readID"]) + dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) + dups["same_tile"] = dups["tile"] == dups["parent_tile"] + bytile_dups = ( + dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups class PairCounter(Mapping): @@ -210,7 +287,7 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("frac_cis_10kb+", 0), ("frac_cis_20kb+", 0), ("frac_cis_40kb+", 0), - ("complexity", 0), + ("naive_complexity", 0), ] ) @@ -306,7 +383,7 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) - self._stat["summary"]["complexity"] = complexity_estimate( + self._stat["summary"]["naive_complexity"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"] ) @@ -594,6 +671,10 @@ def __add__(self, other): for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] sum_stat.calculate_summaries() + sum_stat["summary"]["naive_complexity"] = ( + self._stat["summary"]["naive_complexity"] + + other._stat["summary"]["naive_complexity"] + ) return sum_stat # we need this to be able to 
sum(list_of_PairCounters) From c05e438354426ed927cc3b9874f2839fab1bc5a1 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 22 Apr 2022 16:16:38 +0200 Subject: [PATCH 05/20] Towards duplicate statistics --- pairtools/pairtools_stats.py | 63 ++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 7fdc2e7a..060d81dd 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -3,6 +3,7 @@ import io import sys import click +import warnings import numpy as np import pandas as pd @@ -26,8 +27,15 @@ " all overlapping statistics. Non-overlapping statistics are appended to" " the end of the file.", ) +@click.option( + "--by-tile-dups", + is_flag=True, + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately. Requires parent_readID column to be saved" + " by dedup (will ignore this option otherwise)", +) @common_io_options -def stats(input_path, output, merge, **kwargs): +def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -37,10 +45,10 @@ def stats(input_path, output, merge, **kwargs): The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. """ - stats_py(input_path, output, merge, **kwargs) + stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs) -def stats_py(input_path, output, merge, **kwargs): +def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): if merge: do_merge(output, input_path, **kwargs) return @@ -69,25 +77,24 @@ def stats_py(input_path, output, merge, **kwargs): header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) + if analyse_by_tile_duplication and "parent_readID" not in cols: + warnings.warn( + "No 'parent_readID' column in the file, not generating duplicate" " stats." + ) + analyse_by_tile_duplication = False + elif analyse_by_tile_duplication: + bytile_dups = pd.DataFrame([]) # new stats class stuff would come here ... stats = PairCounter() # Collecting statistics - for line in body_stream: - cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP) - - # algn1: - chrom1 = cols[_pairsam_format.COL_C1] - pos1 = int(cols[_pairsam_format.COL_P1]) - strand1 = cols[_pairsam_format.COL_S1] - # algn2: - chrom2 = cols[_pairsam_format.COL_C2] - pos2 = int(cols[_pairsam_format.COL_P2]) - strand2 = cols[_pairsam_format.COL_S2] - # pair type: - pair_type = cols[_pairsam_format.COL_PTYPE] - - stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type) + for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): + stats.add_pairs_from_dataframe(chunk) + if analyse_by_tile_duplication: + dups = chunk.iloc[chunk["duplicate"]] + bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) + dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] + stats._stat["total_dups_by_tile_median"] = dups_by_tile_median stats.calculate_summaries() # save statistics to file ... 
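As a sanity check on the formula in estimate_library_complexity above: it is consistent with inverting the Poisson-sampling relation u = (1 - exp(-N/C)) / (N/C), where N is the number of sequenced read pairs, C the number of distinct molecules in the library, and u the observed unique fraction; the Lambert-W term recovers x = N/C, and C = N/x. A small self-contained check with made-up numbers (not the package's API, just the same expression):

import numpy as np
from scipy import special

def estimate_complexity(nseq, ndup, nopticaldup=0):
    # Same expression as estimate_library_complexity above.
    nseq -= nopticaldup
    ndup -= nopticaldup
    u = (nseq - ndup) / nseq                                  # unique fraction
    x = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u    # x = nseq / C
    return nseq / x

# Simulate sequencing 2M read pairs from a library of 1M distinct molecules;
# the expected number of unique pairs follows the Poisson model, and the
# estimator should recover ~1e6.
C_true, nseq = 1_000_000, 2_000_000
unique = C_true * (1 - np.exp(-nseq / C_true))
print(estimate_complexity(nseq, nseq - unique))  # ~1_000_000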
@@ -383,9 +390,17 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) - self._stat["summary"]["naive_complexity"] = estimate_library_complexity( - self._stat["total_mapped"], self._stat["total_dups"] + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( + self._stat["total_mapped"], self._stat["total_dups"], 0 ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) @classmethod def from_file(cls, file_handle): @@ -675,6 +690,14 @@ def __add__(self, other): self._stat["summary"]["naive_complexity"] + other._stat["summary"]["naive_complexity"] ) + if ( + "complexity_dups_by_tile_median" in self._stat + and "complexity_dups_by_tile_median" in other._stat + ): + sum_stat["summary"]["complexity_dups_by_tile_median"] = ( + self._stat["summary"]["complexity_dups_by_tile_median"] + + other._stat["summary"]["complexity_dups_by_tile_median"] + ) return sum_stat # we need this to be able to sum(list_of_PairCounters) From a857a1bbadd622323326f11f62ad42267d09b073 Mon Sep 17 00:00:00 2001 From: Phlya Date: Mon, 25 Apr 2022 14:12:01 +0200 Subject: [PATCH 06/20] Fix by-chrom stats to ignore dups --- pairtools/pairtools_stats.py | 31 ++++++++------ tests/data/mock.stats.pairs | 21 +++++++++ tests/test_stats.py | 82 ++++++++++++++++++------------------ 3 files changed, 82 insertions(+), 52 deletions(-) create mode 100644 tests/data/mock.stats.pairs diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 060d81dd..5d467c11 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,7 +28,7 @@ " the end of the file.", ) @click.option( - "--by-tile-dups", + "--analyse-by-tile-duplication", is_flag=True, help="If specified, will analyse by-tile duplication statistics to estimate" " library complexity more accurately. Requires parent_readID column to be saved" @@ -93,8 +93,9 @@ def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): if analyse_by_tile_duplication: dups = chunk.iloc[chunk["duplicate"]] bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) - dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] - stats._stat["total_dups_by_tile_median"] = dups_by_tile_median + if analyse_by_tile_duplication: + dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] + stats._stat["total_dups_by_tile_median"] = dups_by_tile_median stats.calculate_summaries() # save statistics to file ... 
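To make the by-tile bookkeeping concrete: extract_tile_info (added in the previous patch) reduces a standard Illumina read name, instrument:run:flowcell:lane:tile:x:y, to its flowcell:lane:tile triple, and analyse_duplicate_stats then counts duplicate pairs per (tile, parent_tile) combination; duplicates sharing a tile with their parent read are the likely optical/clustering artifacts that the corrected complexity estimate discounts. A small illustration with made-up read names:

import pandas as pd

# Hypothetical read names in the usual Illumina layout:
# <instrument>:<run>:<flowcell>:<lane>:<tile>:<x>:<y>
names = pd.Series(
    [
        "A00123:456:HXYZ123:1:1101:1000:2000",
        "A00123:456:HXYZ123:1:1101:1050:2110",
        "A00123:456:HXYZ123:2:2205:3000:4000",
    ]
)
split = names.str.split(":", expand=True)            # as in extract_tile_info
tiles = split[2] + ":" + split[3] + ":" + split[4]
print(tiles.tolist())
# ['HXYZ123:1:1101', 'HXYZ123:1:1101', 'HXYZ123:2:2205']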
@@ -191,7 +192,7 @@ def extract_tile_info(series, regex=False): def analyse_duplicate_stats(dups, tile_dup_regex=False): - """_summary_ + """Count by-tile duplicates Parameters ---------- @@ -294,7 +295,8 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("frac_cis_10kb+", 0), ("frac_cis_20kb+", 0), ("frac_cis_40kb+", 0), - ("naive_complexity", 0), + ("frac_dups", 0), + ("complexity_naive", 0), ] ) @@ -390,6 +392,9 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) + self._stat["summary"]["frac_dups"] = ( + self._stat["total_dups"] / self._stat["total_mapped"] + ) self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) @@ -615,18 +620,20 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) + df_nodups = df_mapped.loc[~mask_dups, :] + mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] + df_cis = df_nodups.loc[mask_cis, :].copy() + # Count pairs per chromosome: for (chrom1, chrom2), chrom_count in ( - df_mapped[["chrom1", "chrom2"]].value_counts().items() + df_nodups[["chrom1", "chrom2"]].value_counts().items() ): self._stat["chrom_freq"][(chrom1, chrom2)] = ( self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count ) # Count cis-trans by pairs: - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() + self._stat["cis"] += df_cis.shape[0] self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) @@ -686,9 +693,9 @@ def __add__(self, other): for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] sum_stat.calculate_summaries() - sum_stat["summary"]["naive_complexity"] = ( - self._stat["summary"]["naive_complexity"] - + other._stat["summary"]["naive_complexity"] + sum_stat["summary"]["complexity_naive"] = ( + self._stat["summary"]["complexity_naive"] + + other._stat["summary"]["complexity_naive"] ) if ( "complexity_dups_by_tile_median" in self._stat diff --git a/tests/data/mock.stats.pairs b/tests/data/mock.stats.pairs new file mode 100644 index 00000000..4d6bd7fa --- /dev/null +++ b/tests/data/mock.stats.pairs @@ -0,0 +1,21 @@ +## pairs format v1.0.0 +#shape: upper triangle +#genome_assembly: unknown +#samheader: @SQ SN:chr1 LN:100 +#samheader: @SQ SN:chr2 LN:100 +#samheader: @SQ SN:chr3 LN:100 +#samheader: @PG ID:bwa PN:bwa VN:0.7.15-r1140 CL:bwa mem -SP /path/ucsc.hg19.fasta.gz /path/1.fastq.gz /path/2.fastq.gz +#chromosomes: chr2 chr3 chr1 +#chromsize: chr2 100 +#chromsize: chr3 100 +#chromsize: chr1 100 +#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type +readid01 chr1 1 chr2 20 + + UU +readid02 chr1 1 chr1 50 + + UU +readid03 chr1 1 chr1 50 + + DD +readid04 chr1 1 chr1 2 + + UU +readid05 chr1 1 chr1 3 + + UR +readid06 chr2 1 chr3 2 + + UU +readid07 ! 0 chr1 3 - + NU +readid08 ! 0 chr1 3 - + MU +readid09 ! 0 ! 
0 - - WW diff --git a/tests/test_stats.py b/tests/test_stats.py index b6691369..2f906569 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -3,28 +3,27 @@ import sys import subprocess from nose.tools import assert_raises +import numpy as np testdir = os.path.dirname(os.path.realpath(__file__)) + def test_mock_pairsam(): - mock_pairsam_path = os.path.join(testdir, 'data', 'mock.pairsam') + mock_pairsam_path = os.path.join(testdir, "data", "mock.stats.pairs") try: result = subprocess.check_output( - ['python', - '-m', - 'pairtools', - 'stats', - mock_pairsam_path], - ).decode('ascii') + ["python", "-m", "pairtools", "stats", mock_pairsam_path], + ).decode("ascii") except subprocess.CalledProcessError as e: print(e.output) print(sys.exc_info()) raise e - - stats = dict(l.strip().split('\t') - for l in result.split('\n') - if not l.startswith('#') and l.strip()) + stats = dict( + l.strip().split("\t") + for l in result.split("\n") + if not l.startswith("#") and l.strip() + ) for k in stats: try: @@ -33,34 +32,37 @@ def test_mock_pairsam(): stats[k] = float(stats[k]) print(stats) - assert stats['total'] == 8 - assert stats['total_single_sided_mapped'] == 2 - assert stats['total_mapped'] == 5 - assert stats['cis'] == 3 - assert stats['trans'] == 2 - assert stats['pair_types/UU'] == 4 - assert stats['pair_types/NU'] == 1 - assert stats['pair_types/WW'] == 1 - assert stats['pair_types/UR'] == 1 - assert stats['pair_types/MU'] == 1 - assert stats['chrom_freq/chr1/chr2'] == 1 - assert stats['chrom_freq/chr1/chr1'] == 3 - assert stats['chrom_freq/chr2/chr3'] == 1 - assert all(stats[k]==0 - for k in stats - if k.startswith('dist_freq') - and k not in ['dist_freq/1-2/++', - 'dist_freq/2-3/++', - 'dist_freq/32-56/++']) + assert stats["total"] == 9 + assert stats["total_single_sided_mapped"] == 2 + assert stats["total_mapped"] == 6 + assert stats["total_dups"] == 1 + assert stats["cis"] == 3 + assert stats["trans"] == 2 + assert stats["pair_types/UU"] == 4 + assert stats["pair_types/NU"] == 1 + assert stats["pair_types/WW"] == 1 + assert stats["pair_types/UR"] == 1 + assert stats["pair_types/MU"] == 1 + assert stats["pair_types/DD"] == 1 + assert stats["chrom_freq/chr1/chr2"] == 1 + assert stats["chrom_freq/chr1/chr1"] == 3 + assert stats["chrom_freq/chr2/chr3"] == 1 + assert all( + stats[k] == 0 + for k in stats + if k.startswith("dist_freq") + and k not in ["dist_freq/1-2/++", "dist_freq/2-3/++", "dist_freq/32-56/++"] + ) - assert stats['dist_freq/1-2/++'] == 1 - assert stats['dist_freq/2-3/++'] == 1 - assert stats['dist_freq/32-56/++'] == 1 - assert stats['summary/frac_cis'] == 0.6 - assert stats['summary/frac_cis_1kb+'] == 0 - assert stats['summary/frac_cis_2kb+'] == 0 - assert stats['summary/frac_cis_4kb+'] == 0 - assert stats['summary/frac_cis_10kb+'] == 0 - assert stats['summary/frac_cis_20kb+'] == 0 - assert stats['summary/frac_cis_40kb+'] == 0 + assert stats["dist_freq/1-2/++"] == 1 + assert stats["dist_freq/2-3/++"] == 1 + assert stats["dist_freq/32-56/++"] == 1 + assert stats["summary/frac_cis"] == 0.6 + assert stats["summary/frac_cis_1kb+"] == 0 + assert stats["summary/frac_cis_2kb+"] == 0 + assert stats["summary/frac_cis_4kb+"] == 0 + assert stats["summary/frac_cis_10kb+"] == 0 + assert stats["summary/frac_cis_20kb+"] == 0 + assert stats["summary/frac_cis_40kb+"] == 0 + assert np.isclose(stats["summary/frac_dups"], 1 / 6) From 30014383499dc4dc0d3e0e4516a98b7ec155fee0 Mon Sep 17 00:00:00 2001 From: Phlya Date: Mon, 25 Apr 2022 14:13:50 +0200 Subject: [PATCH 07/20] Rename mock4stats 
--- tests/data/{mock.stats.pairs => mock.4stats.pairs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/data/{mock.stats.pairs => mock.4stats.pairs} (100%) diff --git a/tests/data/mock.stats.pairs b/tests/data/mock.4stats.pairs similarity index 100% rename from tests/data/mock.stats.pairs rename to tests/data/mock.4stats.pairs From 1a614e7dac797f2de75ae5615270975718d0fff0 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 26 Apr 2022 15:36:50 +0200 Subject: [PATCH 08/20] Make dedup stats! --- pairtools/pairtools_dedup.py | 57 ++++++++++++++---------- pairtools/pairtools_stats.py | 86 +++++++++++++++++++++++++++--------- tests/test_stats.py | 2 +- 3 files changed, 101 insertions(+), 44 deletions(-) diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py index c2f472d0..cca5ce89 100644 --- a/pairtools/pairtools_dedup.py +++ b/pairtools/pairtools_dedup.py @@ -70,7 +70,15 @@ " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." " By default, statistics are not printed.", ) - +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed.", +) ### Set the dedup method: @click.option( "--max-mismatch", @@ -233,6 +241,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -270,6 +279,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -303,6 +313,7 @@ def dedup_py( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -360,6 +371,17 @@ def dedup_py( else None ) + out_bytile_dups_stats_stream = ( + _fileio.auto_open( + output_bytile_dups_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_bytile_dups_stats + else None + ) + # generate empty PairCounter if stats output is requested: out_stat = PairCounter() if output_stats else None @@ -401,8 +423,8 @@ def dedup_py( outstream.writelines((l + "\n" for l in header)) if send_header_to_dup and outstream_dups and (outstream_dups != outstream): dups_header = header - if keep_parent_id and len(dups_header)>0: - dups_header= _headerops.append_columns(dups_header, ["parent_readID"]) + if keep_parent_id and len(dups_header) > 0: + dups_header = _headerops.append_columns(dups_header, ["parent_readID"]) outstream_dups.writelines((l + "\n" for l in dups_header)) if ( outstream_unmapped @@ -475,6 +497,9 @@ def dedup_py( if out_stat: out_stat.save(out_stats_stream) + if output_bytile_dups_stats: + out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + if instream != sys.stdin: instream.close() @@ -495,7 +520,7 @@ def dedup_py( out_stats_stream.close() -### Cython deduplication #### +### Chunked dedup ### def streaming_dedup( in_stream, colnames, @@ -531,15 +556,14 @@ def streaming_dedup( n_proc=n_proc, ) - t0 = time.time() - N = 0 - for df_chunk in deduped_chunks: - N += df_chunk.shape[0] - # Write the stats if requested: if out_stat is not None: - out_stat.add_pairs_from_dataframe(df_chunk, unmapped_chrom=unmapped_chrom) + out_stat.add_pairs_from_dataframe( + df_chunk, + unmapped_chrom=unmapped_chrom, + analyse_bytile_dups=keep_parent_id, + ) # Define masks of unmapped and duplicated reads: mask_mapped = 
np.logical_and( @@ -572,11 +596,6 @@ def streaming_dedup( outstream, index=False, header=False, sep="\t" ) - t1 = time.time() - t = t1 - t0 - print(f"total time: {t}") - print(f"time per mln pairs: {t/N*1e6}") - def _dedup_stream( in_stream, @@ -838,9 +857,6 @@ def streaming_dedup_cython( strandDict = {} curMaxLen = max(MAX_LEN, dd.getLen()) - t0 = time.time() - N = 0 - instream = iter(instream) read_idx = 0 # read index to mark the parent readID while True: @@ -900,7 +916,6 @@ def streaming_dedup_cython( else: s1.append(fetchadd(cols[s1ind], strandDict)) s2.append(fetchadd(cols[s2ind], strandDict)) - N += 1 if (not stripline) or (len(c1) == curMaxLen): if keep_parent_id: res, parents = dd.push( @@ -997,10 +1012,6 @@ def streaming_dedup_cython( # all lines have been processed at this point. # streaming_dedup is over. - t1 = time.time() - t = t1 - t0 - print(f"total time: {t}") - print(f"time per mln pairs: {t/N*1e6}") def fetchadd(key, mydict): diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 5d467c11..1c8682d8 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,14 +28,25 @@ " the end of the file.", ) @click.option( - "--analyse-by-tile-duplication", - is_flag=True, + "--analyse-bytile-dups", + type=str, + default="", help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately. Requires parent_readID column to be saved" - " by dedup (will ignore this option otherwise)", + " library complexity more accurately." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately and will save details to this path." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @common_io_options -def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): +def stats( + input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs +): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -45,10 +56,19 @@ def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. """ - stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs) + stats_py( + input_path, + output, + merge, + analyse_bytile_dups, + output_bytile_dups_stats, + **kwargs, + ) -def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): +def stats_py( + input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs +): if merge: do_merge(output, input_path, **kwargs) return @@ -77,25 +97,23 @@ def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) - if analyse_by_tile_duplication and "parent_readID" not in cols: + if ( + analyse_bytile_dups or output_bytile_dups_stats + ) and "parent_readID" not in cols: warnings.warn( - "No 'parent_readID' column in the file, not generating duplicate" " stats." + "No 'parent_readID' column in the file, not generating duplicate stats." 
) - analyse_by_tile_duplication = False - elif analyse_by_tile_duplication: - bytile_dups = pd.DataFrame([]) + analyse_bytile_dups = False + output_bytile_dups_stats = False # new stats class stuff would come here ... stats = PairCounter() # Collecting statistics for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): stats.add_pairs_from_dataframe(chunk) - if analyse_by_tile_duplication: - dups = chunk.iloc[chunk["duplicate"]] - bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) - if analyse_by_tile_duplication: - dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] - stats._stat["total_dups_by_tile_median"] = dups_by_tile_median + + if output_bytile_dups_stats: + stats.save_bytile_dups() stats.calculate_summaries() # save statistics to file ... @@ -299,6 +317,11 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("complexity_naive", 0), ] ) + self.bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) + ) def __getitem__(self, key): if isinstance(key, str): @@ -398,6 +421,10 @@ def calculate_summaries(self): self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) + if self.bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] + ) if "dups_by_tile_median" in self._stat: self._stat["summary"][ "complexity_dups_by_tile_median" @@ -575,7 +602,9 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): + def add_pairs_from_dataframe( + self, df, unmapped_chrom="!", analyse_bytile_dups=False + ): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -616,7 +645,8 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups_count = mask_dups.sum() + dups = df_mapped[mask_dups] + dups_count = dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) @@ -650,6 +680,12 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) + ### Add by-tile dups + if analyse_bytile_dups and dups.shape[0] > 0: + self.bytile_dups = self.bytile_dups.add( + analyse_duplicate_stats(dups), fill_value=0 + ).astype(int) + def __add__(self, other): # both PairCounter are implied to have a list of common fields: # @@ -705,6 +741,7 @@ def __add__(self, other): self._stat["summary"]["complexity_dups_by_tile_median"] + other._stat["summary"]["complexity_dups_by_tile_median"] ) + # self.bytile_dups.add(other.bytile_dups, fill_value=0).astype(int) return sum_stat # we need this to be able to sum(list_of_PairCounters) @@ -789,6 +826,15 @@ def save(self, outstream): for k, v in self.flatten().items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + def save_bytile_dups(self, outstream): + """save bytile duplication counts to a tab-delimited text file. 
+ + Parameters + ---------- + outstream: file handle + """ + self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + if __name__ == "__main__": stats() diff --git a/tests/test_stats.py b/tests/test_stats.py index 2f906569..9a516bb7 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -9,7 +9,7 @@ def test_mock_pairsam(): - mock_pairsam_path = os.path.join(testdir, "data", "mock.stats.pairs") + mock_pairsam_path = os.path.join(testdir, "data", "mock.4stats.pairs") try: result = subprocess.check_output( ["python", "-m", "pairtools", "stats", mock_pairsam_path], From 0a0207982f5027733e0d852e040192b88b4f37b5 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 10:35:51 +0200 Subject: [PATCH 09/20] Address comments --- pairtools/pairtools_stats.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 1c8682d8..25349e3b 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,10 +28,9 @@ " the end of the file.", ) @click.option( - "--analyse-bytile-dups", - type=str, - default="", - help="If specified, will analyse by-tile duplication statistics to estimate" + "--bytile-dups/--no-bytile-dups", + default=False, + help="If enabled, will analyse by-tile duplication statistics to estimate" " library complexity more accurately." " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @@ -176,6 +175,9 @@ def estimate_library_complexity(nseq, ndup, nopticaldup=0): Estimated complexity """ nseq = nseq - nopticaldup + if nseq == 0: + warnings.warn("Empty of fully duplicated library, can't estimate complexity") + return 0 ndup = ndup - nopticaldup u = (nseq - ndup) / nseq seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u @@ -413,10 +415,14 @@ def calculate_summaries(self): "cis_40kb+", ): self._stat["summary"][f"frac_{cis_count}"] = ( - self._stat[cis_count] / self._stat["total_nodups"] + (self._stat[cis_count] / self._stat["total_nodups"]) + if self._stat["total_nodups"] > 0 + else 0 ) self._stat["summary"]["frac_dups"] = ( - self._stat["total_dups"] / self._stat["total_mapped"] + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 ) self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 From f0bb8ca26945ea531f5acb265cf7307ad6f31f58 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 10:38:37 +0200 Subject: [PATCH 10/20] FIx argument name --- pairtools/pairtools_stats.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 25349e3b..f0dd71cd 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -43,9 +43,7 @@ " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @common_io_options -def stats( - input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs -): +def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -56,17 +54,12 @@ def stats( The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. 
""" stats_py( - input_path, - output, - merge, - analyse_bytile_dups, - output_bytile_dups_stats, - **kwargs, + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, ) def stats_py( - input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs ): if merge: do_merge(output, input_path, **kwargs) @@ -96,13 +89,11 @@ def stats_py( header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) - if ( - analyse_bytile_dups or output_bytile_dups_stats - ) and "parent_readID" not in cols: + if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: warnings.warn( "No 'parent_readID' column in the file, not generating duplicate stats." ) - analyse_bytile_dups = False + bytile_dups = False output_bytile_dups_stats = False # new stats class stuff would come here ... stats = PairCounter() @@ -608,9 +599,7 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe( - self, df, unmapped_chrom="!", analyse_bytile_dups=False - ): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -687,7 +676,7 @@ def add_pairs_from_dataframe( self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) ### Add by-tile dups - if analyse_bytile_dups and dups.shape[0] > 0: + if bytile_dups and dups.shape[0] > 0: self.bytile_dups = self.bytile_dups.add( analyse_duplicate_stats(dups), fill_value=0 ).astype(int) From 9a378fc27a1e8afffbc8fcd940728c3edce07588 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 14:13:34 +0200 Subject: [PATCH 11/20] Split dedup and stats into lib and cli for 1.0 --- pairtools/__init__.py | 119 +--- pairtools/_dedup.pyx | 293 ---------- pairtools/cli/__init__.py | 206 +++++++ pairtools/cli/dedup.py | 511 +++++++++++++++++ pairtools/cli/stats.py | 117 ++++ pairtools/pairtools_dedup.py | 1025 ---------------------------------- pairtools/pairtools_stats.py | 835 --------------------------- 7 files changed, 837 insertions(+), 2269 deletions(-) delete mode 100644 pairtools/_dedup.pyx create mode 100644 pairtools/cli/__init__.py create mode 100644 pairtools/cli/dedup.py create mode 100755 pairtools/cli/stats.py delete mode 100644 pairtools/pairtools_dedup.py delete mode 100755 pairtools/pairtools_stats.py diff --git a/pairtools/__init__.py b/pairtools/__init__.py index ceda0d19..e4aa6788 100644 --- a/pairtools/__init__.py +++ b/pairtools/__init__.py @@ -1,128 +1,15 @@ -# -*- coding: utf-8 -*- """ pairtools ~~~~~~~~~ CLI tools to process mapped Hi-C data -:copyright: (c) 2017-2021 Open2C +:copyright: (c) 2017-2022 Open2C :author: Open2C :license: MIT """ -__version__ = '0.3.1-dev.1' +__version__ = "1.0.0-dev1" - -import click -import functools -import sys - -CONTEXT_SETTINGS = { - 'help_option_names': ['-h', '--help'], -} - - -@click.version_option(version=__version__) -@click.group(context_settings=CONTEXT_SETTINGS) -@click.option( - '--post-mortem', - help="Post mortem debugging", - is_flag=True, - default=False -) - -@click.option( - '--output-profile', - help="Profile performance with Python cProfile and dump the statistics " - "into a binary file", - type=str, - default='' -) -def cli(post_mortem, output_profile): - '''Flexible tools for Hi-C data processing. 
- - All pairtools have a few common options, which should be typed _before_ - the command name. - - ''' - if post_mortem: - import traceback - try: - import ipdb as pdb - except ImportError: - import pdb - def _excepthook(exc_type, value, tb): - traceback.print_exception(exc_type, value, tb) - print() - pdb.pm() - sys.excepthook = _excepthook - - if output_profile: - import cProfile - import atexit - - pr = cProfile.Profile() - pr.enable() - - def _atexit_profile_hook(): - pr.disable() - pr.dump_stats(output_profile) - - atexit.register(_atexit_profile_hook) - - -def common_io_options(func): - @click.option( - '--nproc-in', - type=int, - default=3, - show_default=True, - help='Number of processes used by the auto-guessed input decompressing command.' - ) - @click.option( - '--nproc-out', - type=int, - default=8, - show_default=True, - help='Number of processes used by the auto-guessed output compressing command.' - ) - @click.option( - '--cmd-in', - type=str, - default=None, - help='A command to decompress the input file. ' - 'If provided, fully overrides the auto-guessed command. ' - 'Does not work with stdin and pairtools parse. ' - 'Must read input from stdin and print output into stdout. ' - 'EXAMPLE: pbgzip -dc -n 3' - ) - @click.option( - '--cmd-out', - type=str, - default=None, - help='A command to compress the output file. ' - 'If provided, fully overrides the auto-guessed command. ' - 'Does not work with stdout. ' - 'Must read input from stdin and print output into stdout. ' - 'EXAMPLE: pbgzip -c -n 8' - ) - - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - return wrapper - -from .pairtools_dedup import dedup -from .pairtools_sort import sort -from .pairtools_flip import flip -from .pairtools_merge import merge -from .pairtools_markasdup import markasdup -from .pairtools_select import select -from .pairtools_split import split -from .pairtools_restrict import restrict -from .pairtools_phase import phase -from .pairtools_parse import parse -from .pairtools_stats import stats -from .pairtools_sample import sample -from .pairtools_filterbycov import filterbycov +# from . import lib diff --git a/pairtools/_dedup.pyx b/pairtools/_dedup.pyx deleted file mode 100644 index 2c08db9a..00000000 --- a/pairtools/_dedup.pyx +++ /dev/null @@ -1,293 +0,0 @@ -""" -Legacy code: -``mark_duplicates`` is an offline method that finds duplicates in a given -input dataset. - -For other applications on much larger datasets you may consider an online -method ``OnlineDuplicateDetector`` which is implemented as a class. -Note that for both methods data types are fixed: - * chromosomes are int32 - * position is int32 - * strand is int32, which is basically the same as C type "char". -""" -import numpy as np -import cython - -cimport numpy as np -cimport cython - - -### Legacy offfline deduplicator: -def mark_duplicates( - cython.int [:] c1, - cython.int [:] c2, - cython.int [:] p1, - cython.int [:] p2, - cython.int [:] s1, - cython.int [:] s2, - #uncomment for testing probably since it will do boundary check - #np.ndarray[np.int16_t, ndim=1]c2, - #np.ndarray[np.int32_t, ndim=1] p1, - #np.ndarray[np.int32_t, ndim=1] p2, - #np.ndarray[np.int8_t, ndim=1] s1, - #np.ndarray[np.int8_t, ndim=1] s2, - int max_mismatch=3, - method = "sum"): - """ - Mark duplicates, allowing for some mismatch on the both sides of the molecule. - You can use it to filter single-cell data as well by setting max_mismatch to - 500bp or even larger. It works as fast as we could make it. 
- This methods scans through a list of reads. It then flags duplicates, - which are defined as molecules, with both ends located within `max_mismatch` - bp from each other. - - There are two ways define duplicates: - "max": two reads are duplicates if the mismatch of the genomic locations - of both ends is less-or-equal "max_mismatch" - "sum": two reads are duplicates if the sum of the mismatches of the either - ends of the molecule is less-or-equal "max_mismatch" - Other methods could be added below by editing the code - - Parameters - ---------- - c1, c2 : int32 array - chromosome IDs - - p1, p2 : int32 arrays - positions - - s1, s2 : int32 (or bool) arrays - strands - - max_mismatch : int - method : "sum" or "max" - use the sum of mismatches, or the max of the two - - Returns - ------- - mask : int8 array - A binary mask, where 1 denotes that a read is a duplicate. - Notes - ----- - Arrays MUST be ordered by (c1, p1) - - """ - cdef int N = len(c1) - cdef np.ndarray[np.int8_t, ndim=1] mask = np.zeros(N, dtype=np.int8) - cdef int low = 0 - cdef int high = 1 - cdef int extraCondition - cdef int methodid - - if method == "max": - methodid = 0 - elif method == "sum": - methodid = 1 - else: - raise ValueError('method should be "sum" or "max"') - - while True: - assert False - if low == N: - break - - if high == N: - low += 1 - high = low + 1 - continue - - if mask[low] == 1: - low += 1 - high = low+1 - continue - - # if high already removed, just continue - if mask[high] == 1: - high += 1 - continue - - # if we jumped too far, continue - if (c1[high] != c1[low]) or (p1[high] - p1[low] > max_mismatch) or (p1[high] < p1[low]) or (c2[high] != c2[low]): - low += 1 - high = low + 1 # restart high - continue - - if methodid == 0: - extraCondition = (max(abs(p1[low] - p1[high]), - abs(p2[low] - p2[high])) <= max_mismatch) - elif methodid == 1: - extraCondition = (abs(p1[low] - p1[high]) + - abs(p2[low] - p2[high]) <= max_mismatch) - else: - raise ValueError( - "Unknown method id, this should not happen. 
" - "Check code of this function.") - - if ((c2[low] == c2[high]) and (s1[low] == s1[high]) and - (s2[low] == s2[high]) and extraCondition): - mask[high] = 1 - high += 1 - continue - high += 1 - - return mask - - -### Online deduplicator used in pairtools.dedup Cython: -cdef class OnlineDuplicateDetector(object): - cdef cython.int [:] c1 - cdef cython.int [:] c2 - cdef cython.int [:] p1 - cdef cython.int [:] p2 - cdef cython.int [:] s1 - cdef cython.int [:] s2 - cdef cython.char [:] rm - cdef cython.int [:] parent_idxs - cdef int methodid - cdef int low - cdef int high - cdef int N - cdef int max_mismatch - cdef int returnData - cdef int keep_parent_id - - def __init__(self, method, max_mismatch, returnData=False, keep_parent_id=False): - if returnData == False: - self.returnData = 0 - else: - self.returnData = 1 - if keep_parent_id == False: - self.keep_parent_id = 0 - else: - self.keep_parent_id = 1 - self.parent_idxs = np.zeros(0, np.int32) - - self.N = 0 - self.c1 = np.zeros(0, np.int32) - self.c2 = np.zeros(0, np.int32) - self.p1 = np.zeros(0, np.int32) - self.p2 = np.zeros(0, np.int32) - self.s1 = np.zeros(0, np.int32) - self.s2 = np.zeros(0, np.int32) - - self.rm = np.zeros(0, np.int8) - if method == "max": - self.methodid = 0 - elif method == "sum": - self.methodid = 1 - else: - raise ValueError('method should be "sum" or "max"') - self.max_mismatch = int(max_mismatch) - self.low = 0 - self.high = 1 - - def _shrink(self): - if self.returnData == 1: - firstret = self.rm[:self.low] - retainMask = (np.asarray(firstret) == False) - del firstret - ret = [] - for ar in [self.c1, self.c2, self.p1, self.p2, self.s1, self.s2]: - ret.append(np.asarray(ar)[:self.low][retainMask]) - self.c1 = self.c1[self.low:] - self.c2 = self.c2[self.low:] - self.p1 = self.p1[self.low:] - self.p2 = self.p2[self.low:] - self.s1 = self.s1[self.low:] - self.s2 = self.s2[self.low:] - pastrm = self.rm[:self.low] - self.rm = self.rm[self.low:] - self.high = self.high-self.low - self.N = self.N - self.low - if self.returnData == 1: - self.low = 0 - return ret - if self.keep_parent_id == 1: # Return parent readIDs alongside with duplicates mask: - pastidx = self.parent_idxs[:self.low] - self.low = 0 - return pastrm, pastidx - return pastrm - - def _run(self, finish=False): - cdef int finishing = 0 - cdef int extraCondition - - if finish: - finishing = 1 - - while True: - if self.low == self.N: - break - - if self.high == self.N: - if finishing == 1: - self.low += 1 - self.high = self.low + 1 - continue - else: - break - - if self.rm[self.low] == 1: - self.low += 1 - self.high = self.low+1 - continue - - # if high already removed, just continue - if self.rm[self.high] == 1: - self.high += 1 - continue - - # if we jumped too far, continue - if ((self.c1[self.high] != self.c1[self.low]) or - (self.p1[self.high] - self.p1[self.low] > self.max_mismatch) or - (self.p1[self.high] - self.p1[self.low] < 0 )): - self.low += 1 - self.high = self.low + 1 # restart high - continue - - if self.methodid == 0: - extraCondition = max( - abs(self.p1[self.low] - self.p1[self.high]), - abs(self.p2[self.low] - self.p2[self.high])) <= self.max_mismatch - elif self.methodid == 1: - # sum of distances <= max_mismatch - extraCondition = ( - abs(self.p1[self.low] - self.p1[self.high]) + - abs(self.p2[self.low] - self.p2[self.high]) <= self.max_mismatch - ) - else: - raise ValueError( - "Unknown method id, this should not happen. 
" - "Check code of this function.") - - if ((self.c2[self.low] == self.c2[self.high]) and - (self.s1[self.low] == self.s1[self.high]) and - (self.s2[self.low] == self.s2[self.high]) and - extraCondition): - self.rm[self.high] = 1 - if self.keep_parent_id == 1: - self.parent_idxs[self.high] = self.low - self.high += 1 - continue - self.high += 1 - - return self._shrink() - - def push(self, c1, c2, p1, p2, s1, s2): - self.c1 = np.concatenate([self.c1, c1]) - self.c2 = np.concatenate([self.c2, c2]) - self.p1 = np.concatenate([self.p1, p1]) - self.p2 = np.concatenate([self.p2, p2]) - self.s1 = np.concatenate([self.s1, s1]) - self.s2 = np.concatenate([self.s2, s2]) - self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)]) - if self.keep_parent_id == 1: - self.parent_idxs = np.concatenate([self.parent_idxs, np.zeros(len(c1), dtype=np.int32)]) - self.N = self.N + len(c1) - return self._run(finish=False) - - def finish(self): - return self._run(finish=True) - - def getLen(self): - return int(self.N) \ No newline at end of file diff --git a/pairtools/cli/__init__.py b/pairtools/cli/__init__.py new file mode 100644 index 00000000..6fa156da --- /dev/null +++ b/pairtools/cli/__init__.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- + +import click +import functools +import sys +from .. import __version__ +import logging +from .._logging import get_logger + + +CONTEXT_SETTINGS = { + "help_option_names": ["-h", "--help"], +} + + +@click.version_option(version=__version__) +@click.group(context_settings=CONTEXT_SETTINGS) +@click.option( + "--post-mortem", help="Post mortem debugging", is_flag=True, default=False +) +@click.option( + "--output-profile", + help="Profile performance with Python cProfile and dump the statistics " + "into a binary file", + type=str, + default="", +) +@click.option("-v", "--verbose", help="Verbose logging.", count=True) +@click.option( + "-d", + "--debug", + help="On error, drop into the post-mortem debugger shell.", + is_flag=True, + default=False, +) +def cli(post_mortem, output_profile, verbose, debug): + """Flexible tools for Hi-C data processing. + + All pairtools have a few common options, which should be typed _before_ + the command name. 
+ + """ + if post_mortem: + import traceback + + try: + import ipdb as pdb + except ImportError: + import pdb + + def _excepthook(exc_type, value, tb): + traceback.print_exception(exc_type, value, tb) + print() + pdb.pm() + + sys.excepthook = _excepthook + + if output_profile: + import cProfile + import atexit + + pr = cProfile.Profile() + pr.enable() + + def _atexit_profile_hook(): + pr.disable() + pr.dump_stats(output_profile) + + atexit.register(_atexit_profile_hook) + + # Initialize logging to stderr + logging.basicConfig(stream=sys.stderr) + logging.captureWarnings(True) + root_logger = get_logger() + + # Set verbosity level + if verbose > 0: + root_logger.setLevel(logging.DEBUG) + if verbose > 1: # pragma: no cover + try: + import psutil + import atexit + + @atexit.register + def process_dump_at_exit(): + process_attrs = [ + "cmdline", + # 'connections', + "cpu_affinity", + "cpu_num", + "cpu_percent", + "cpu_times", + "create_time", + "cwd", + # 'environ', + "exe", + # 'gids', + "io_counters", + "ionice", + "memory_full_info", + # 'memory_info', + # 'memory_maps', + "memory_percent", + "name", + "nice", + "num_ctx_switches", + "num_fds", + "num_threads", + "open_files", + "pid", + "ppid", + "status", + "terminal", + "threads", + # 'uids', + "username", + ] + p = psutil.Process() + info_ = p.as_dict(process_attrs, ad_value="") + for key in process_attrs: + root_logger.debug("PSINFO:'{}': {}".format(key, info_[key])) + + except ImportError: + root_logger.warning("Install psutil to see process information.") + + else: + root_logger.setLevel(logging.INFO) + + # Set hook for postmortem debugging + if debug: # pragma: no cover + import traceback + + try: + import ipdb as pdb + except ImportError: + import pdb + + def _excepthook(exc_type, value, tb): + traceback.print_exception(exc_type, value, tb) + print() + pdb.pm() + + sys.excepthook = _excepthook + + +def common_io_options(func): + @click.option( + "--nproc-in", + type=int, + default=3, + show_default=True, + help="Number of processes used by the auto-guessed input decompressing command.", + ) + @click.option( + "--nproc-out", + type=int, + default=8, + show_default=True, + help="Number of processes used by the auto-guessed output compressing command.", + ) + @click.option( + "--cmd-in", + type=str, + default=None, + help="A command to decompress the input file. " + "If provided, fully overrides the auto-guessed command. " + "Does not work with stdin and pairtools parse. " + "Must read input from stdin and print output into stdout. " + "EXAMPLE: pbgzip -dc -n 3", + ) + @click.option( + "--cmd-out", + type=str, + default=None, + help="A command to compress the output file. " + "If provided, fully overrides the auto-guessed command. " + "Does not work with stdout. " + "Must read input from stdin and print output into stdout. " + "EXAMPLE: pbgzip -c -n 8", + ) + @functools.wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +from . import ( + dedup, + sort, + flip, + merge, + markasdup, + select, + split, + restrict, + phase, + parse, + parse2, + stats, + sample, + filterbycov, + header, + scaling, +) + diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py new file mode 100644 index 00000000..dd7ea14d --- /dev/null +++ b/pairtools/cli/dedup.py @@ -0,0 +1,511 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import ast +import pathlib + +# from distutils.log import warn +# import warnings + +from ..lib import fileio, pairsam_format, headerops +from . 
import cli, common_io_options + +import click + +from ..lib.dedup import streaming_dedup, streaming_dedup_cython +from ..lib.stats import PairCounter + +UTIL_NAME = "pairtools_dedup" + + +@cli.command() +@click.argument("pairs_path", type=str, required=False) + +### Output files: +@click.option( + "-o", + "--output", + type=str, + default="", + help="output file for pairs after duplicate removal." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, the output is printed into stdout.", +) +@click.option( + "--output-dups", + type=str, + default="", + help="output file for duplicated pairs. " + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " If the path is the same as in --output or -, output duplicates together " + " with deduped pairs. By default, duplicates are dropped.", +) +@click.option( + "--output-unmapped", + type=str, + default="", + help="output file for unmapped pairs. " + "If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed. " + "If the path is the same as in --output or -, output unmapped pairs together " + "with deduped pairs. If the path is the same as --output-dups, output " + "unmapped reads together with dups. By default, unmapped pairs are dropped.", +) +@click.option( + "--output-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, statistics are not printed.", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed.", +) +### Set the dedup method: +@click.option( + "--max-mismatch", + type=int, + default=3, + show_default=True, + help="Pairs with both sides mapped within this distance (bp) from each " + "other are considered duplicates. ", +) +@click.option( + "--method", + type=click.Choice(["max", "sum"]), + default="max", + help="define the mismatch as either the max or the sum of the mismatches of" + "the genomic locations of the both sides of the two compared molecules", + show_default=True, +) +@click.option( + "--backend", + type=click.Choice(["scipy", "sklearn", "cython"]), + default="scipy", + help="What backend to use: scipy and sklearn are based on KD-trees," + " cython is online indexed list-based algorithm." + " With cython backend, duplication is not transitive with non-zero max mismatch " + " (e.g. pairs A and B are duplicates, and B and C are duplicates, then A and C are " + " not necessary duplicates of each other), while with scipy and sklearn it's " + " transitive (i.e. A and C are necessarily duplicates)." + " Cython is the original version used in pairtools since its beginning." + " It is available for backwards compatibility and to allow specification of the" + " column order." + " Now the default scipy backend is generally the fastest, and with chunksize below" + " 1 mln has the lowest memory requirements." + # " 'cython' is deprecated and provided for backwards compatibility", +) + +### Scipy and sklearn-specific options: +@click.option( + "--chunksize", + type=int, + default=100_000, + show_default=True, + help="Number of pairs in each chunk. Reduce for lower memory footprint." 
+ " Below 10,000 performance starts suffering significantly and the algorithm might" + " miss a few duplicates with non-zero --max-mismatch." + " Only works with '--backend scipy or sklearn'", +) +@click.option( + "--carryover", + type=int, + default=100, + show_default=True, + help="Number of deduped pairs to carry over from previous chunk to the new chunk" + " to avoid breaking duplicate clusters." + " Only works with '--backend scipy or sklearn'", +) +@click.option( + "-p", + "--n-proc", + type=int, + default=1, + help="Number of cores to use. Only applies with sklearn backend." + "Still needs testing whether it is ever useful.", +) + +### Output options: +@click.option( + "--mark-dups", + is_flag=True, + help='If specified, duplicate pairs are marked as DD in "pair_type" and ' + "as a duplicate in the sam entries.", +) +@click.option( + "--keep-parent-id", + is_flag=True, + help="If specified, duplicate pairs are marked with the readID of the retained" + " deduped read in the 'parent_readID' field.", +) +@click.option( + "--extra-col-pair", + nargs=2, + # type=click.Tuple([str, str]), + multiple=True, + help="Extra columns that also must match for two pairs to be marked as " + "duplicates. Can be either provided as 0-based column indices or as column " + 'names (requires the "#columns" header field). The option can be provided ' + "multiple times if multiple column pairs must match. " + 'Example: --extra-col-pair "phase1" "phase2"', +) + +### Input options: +@click.option( + "--sep", + type=str, + default=pairsam_format.PAIRSAM_SEP_ESCAPE, + help=r"Separator (\t, \v, etc. characters are " "supported, pass them in quotes) ", +) +@click.option( + "--comment-char", type=str, default="#", help="The first character of comment lines" +) +@click.option( + "--send-header-to", + type=click.Choice(["dups", "dedup", "both", "none"]), + default="both", + help="Which of the outputs should receive header and comment lines", +) +@click.option( + "--c1", + type=int, + default=pairsam_format.COL_C1, + help=f"Chrom 1 column; default {pairsam_format.COL_C1}" + " Only works with '--backend cython'", +) +@click.option( + "--c2", + type=int, + default=pairsam_format.COL_C2, + help=f"Chrom 2 column; default {pairsam_format.COL_C2}" + " Only works with '--backend cython'", +) +@click.option( + "--p1", + type=int, + default=pairsam_format.COL_P1, + help=f"Position 1 column; default {pairsam_format.COL_P1}" + " Only works with '--backend cython'", +) +@click.option( + "--p2", + type=int, + default=pairsam_format.COL_P2, + help=f"Position 2 column; default {pairsam_format.COL_P2}" + " Only works with '--backend cython'", +) +@click.option( + "--s1", + type=int, + default=pairsam_format.COL_S1, + help=f"Strand 1 column; default {pairsam_format.COL_S1}" + " Only works with '--backend cython'", +) +@click.option( + "--s2", + type=int, + default=pairsam_format.COL_S2, + help=f"Strand 2 column; default {pairsam_format.COL_S2}" + " Only works with '--backend cython'", +) +@click.option( + "--unmapped-chrom", + type=str, + default=pairsam_format.UNMAPPED_CHROM, + help="Placeholder for a chromosome on an unmapped side; default {}".format( + pairsam_format.UNMAPPED_CHROM + ), +) +@common_io_options +def dedup( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + 
n_proc, + **kwargs, +): + """Find and remove PCR/optical duplicates. + + Find PCR duplicates in an upper-triangular flipped sorted pairs/pairsam + file. Allow for a +/-N bp mismatch at each side of duplicated molecules. + + PAIRS_PATH : input triu-flipped sorted .pairs or .pairsam file. If the + path ends with .gz/.lz4, the input is decompressed by bgzip/lz4c. + By default, the input is read from stdin. + """ + + dedup_py( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + n_proc, + **kwargs, + ) + + +if __name__ == "__main__": + dedup() + + +def dedup_py( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + n_proc, + **kwargs, +): + + sep = ast.literal_eval('"""' + sep + '"""') + send_header_to_dedup = send_header_to in ["both", "dedup"] + send_header_to_dup = send_header_to in ["both", "dups"] + + instream = ( + fileio.auto_open( + pairs_path, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if pairs_path + else sys.stdin + ) + outstream = ( + fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) + out_stats_stream = ( + fileio.auto_open( + output_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_stats + else None + ) + + out_bytile_dups_stats_stream = ( + fileio.auto_open( + output_bytile_dups_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_bytile_dups_stats + else None + ) + + # generate empty PairCounter if stats output is requested: + out_stat = PairCounter() if output_stats else None + + if not output_dups: + outstream_dups = None + elif output_dups == "-" or ( + pathlib.Path(output_dups).absolute() == pathlib.Path(output).absolute() + ): + outstream_dups = outstream + else: + outstream_dups = fileio.auto_open( + output_dups, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + if not output_unmapped: + outstream_unmapped = None + elif output_unmapped == "-" or ( + pathlib.Path(output_unmapped).absolute() == pathlib.Path(output).absolute() + ): + outstream_unmapped = outstream + elif ( + pathlib.Path(output_unmapped).absolute() == pathlib.Path(output_dups).absolute() + ): + outstream_unmapped = outstream_dups + else: + outstream_unmapped = fileio.auto_open( + output_unmapped, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + header, body_stream = headerops.get_header(instream) + header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME) + if send_header_to_dedup: + outstream.writelines((l + "\n" for l in header)) + if send_header_to_dup and outstream_dups and (outstream_dups != outstream): + dups_header = header + if keep_parent_id and len(dups_header) > 0: + dups_header = headerops.append_columns(dups_header, ["parent_readID"]) + outstream_dups.writelines((l + "\n" for l in dups_header)) + if ( + outstream_unmapped + and 
(outstream_unmapped != outstream) + and (outstream_unmapped != outstream_dups) + ): + outstream_unmapped.writelines((l + "\n" for l in header)) + + column_names = headerops.extract_column_names(header) + extra_cols1 = [] + extra_cols2 = [] + if extra_col_pair is not None: + for col1, col2 in extra_col_pair: + extra_cols1.append(column_names[col1] if col1.isdigit() else col1) + extra_cols2.append(column_names[col2] if col2.isdigit() else col2) + + if backend == "cython": + # warnings.warn( + # "'cython' backend is deprecated and provided only" + # " for backwards compatibility", + # DeprecationWarning, + # ) + extra_cols1 = [column_names.index(col) for col in extra_cols1] + extra_cols2 = [column_names.index(col) for col in extra_cols2] + streaming_dedup_cython( + method, + max_mismatch, + sep, + c1, + c2, + p1, + p2, + s1, + s2, + extra_cols1, + extra_cols2, + unmapped_chrom, + body_stream, + outstream, + outstream_dups, + outstream_unmapped, + out_stat, + mark_dups, + keep_parent_id, + ) + elif backend in ("scipy", "sklearn"): + streaming_dedup( + in_stream=instream, + colnames=column_names, + chunksize=chunksize, + carryover=carryover, + method=method, + mark_dups=mark_dups, + max_mismatch=max_mismatch, + extra_col_pairs=list(extra_col_pair), + keep_parent_id=keep_parent_id, + unmapped_chrom=unmapped_chrom, + comment_char=comment_char, + outstream=outstream, + outstream_dups=outstream_dups, + outstream_unmapped=outstream_unmapped, + out_stat=out_stat, + backend=backend, + n_proc=n_proc, + ) + else: + raise ValueError("Unknown backend") + + # save statistics to a file if it was requested: + if out_stat: + out_stat.save(out_stats_stream) + + if output_bytile_dups_stats: + out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + + if instream != sys.stdin: + instream.close() + + if outstream != sys.stdout: + outstream.close() + + if outstream_dups and (outstream_dups != outstream): + outstream_dups.close() + + if ( + outstream_unmapped + and (outstream_unmapped != outstream) + and (outstream_unmapped != outstream_dups) + ): + outstream_unmapped.close() + + if out_stats_stream: + out_stats_stream.close() + diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py new file mode 100755 index 00000000..4fb10935 --- /dev/null +++ b/pairtools/cli/stats.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import io +import sys +import click +import warnings + +import pandas as pd +from ..lib import fileio, pairsam_format, headerops +from . import cli, common_io_options + +from ..lib.stats import PairCounter, do_merge + +UTIL_NAME = "pairtools_stats" + + +@cli.command() +@click.argument("input_path", type=str, nargs=-1, required=False) +@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") +@click.option( + "--merge", + is_flag=True, + help="If specified, merge multiple input stats files instead of calculating" + " statistics of a .pairs/.pairsam file. Merging is performed via summation of" + " all overlapping statistics. Non-overlapping statistics are appended to" + " the end of the file.", +) +@click.option( + "--bytile-dups/--no-bytile-dups", + default=False, + help="If enabled, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately." 
+ " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately and will save details to this path." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@common_io_options +def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): + """Calculate pairs statistics. + + INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. + If not provided, the input is read from stdin. + If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number + of stats files to merge. + + The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. + """ + stats_py( + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, + ) + + +def stats_py( + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs +): + if merge: + do_merge(output, input_path, **kwargs) + return + + instream = ( + fileio.auto_open( + input_path[0], + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if input_path + else sys.stdin + ) + outstream = ( + fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) + + header, body_stream = headerops.get_header(instream) + cols = headerops.extract_column_names(header) + + if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: + warnings.warn( + "No 'parent_readID' column in the file, not generating duplicate stats." + ) + bytile_dups = False + output_bytile_dups_stats = False + # new stats class stuff would come here ... + stats = PairCounter() + + # Collecting statistics + for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): + stats.add_pairs_from_dataframe(chunk) + + if output_bytile_dups_stats: + stats.save_bytile_dups() + stats.calculate_summaries() + + # save statistics to file ... + stats.save(outstream) + + if instream != sys.stdin: + instream.close() + if outstream != sys.stdout: + outstream.close() + + +if __name__ == "__main__": + stats() diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py deleted file mode 100644 index cca5ce89..00000000 --- a/pairtools/pairtools_dedup.py +++ /dev/null @@ -1,1025 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from distutils.log import warn -import sys -import ast -import warnings -import pathlib - -import click - -import numpy as np -import pandas as pd - -import scipy.spatial -from scipy.sparse import coo_matrix -from scipy.sparse.csgraph import connected_components - -from . import _dedup, _fileio, _pairsam_format, _headerops, cli, common_io_options -from .pairtools_markasdup import mark_split_pair_as_dup -from .pairtools_stats import PairCounter - - -import time - -UTIL_NAME = "pairtools_dedup" - -# you don't need to load more than 10k lines at a time b/c you get out of the -# CPU cache, so this parameter is not adjustable -MAX_LEN = 10000 - - -@cli.command() -@click.argument("pairs_path", type=str, required=False) - -### Output files: -@click.option( - "-o", - "--output", - type=str, - default="", - help="output file for pairs after duplicate removal." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." 
- " By default, the output is printed into stdout.", -) -@click.option( - "--output-dups", - type=str, - default="", - help="output file for duplicated pairs. " - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " If the path is the same as in --output or -, output duplicates together " - " with deduped pairs. By default, duplicates are dropped.", -) -@click.option( - "--output-unmapped", - type=str, - default="", - help="output file for unmapped pairs. " - "If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed. " - "If the path is the same as in --output or -, output unmapped pairs together " - "with deduped pairs. If the path is the same as --output-dups, output " - "unmapped reads together with dups. By default, unmapped pairs are dropped.", -) -@click.option( - "--output-stats", - type=str, - default="", - help="output file for duplicate statistics." - " If file exists, it will be open in the append mode." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, statistics are not printed.", -) -@click.option( - "--output-bytile-dups-stats", - type=str, - default="", - help="output file for duplicate statistics." - " If file exists, it will be open in the append mode." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, by-tile duplicate statistics are not printed.", -) -### Set the dedup method: -@click.option( - "--max-mismatch", - type=int, - default=3, - show_default=True, - help="Pairs with both sides mapped within this distance (bp) from each " - "other are considered duplicates. ", -) -@click.option( - "--method", - type=click.Choice(["max", "sum"]), - default="max", - help="define the mismatch as either the max or the sum of the mismatches of" - "the genomic locations of the both sides of the two compared molecules", - show_default=True, -) -@click.option( - "--backend", - type=click.Choice(["scipy", "sklearn", "cython"]), - default="scipy", - help="What backend to use: scipy and sklearn are based on KD-trees," - " cython is online indexed list-based algorithm." - " With cython backend, duplication is not transitive with non-zero max mismatch " - " (e.g. pairs A and B are duplicates, and B and C are duplicates, then A and C are " - " not necessary duplicates of each other), while with scipy and sklearn it's " - " transitive (i.e. A and C are necessarily duplicates)." - " Cython is the original version used in pairtools since its beginning." - " It is available for backwards compatibility and to allow specification of the" - " column order." - " Now the default scipy backend is generally the fastest, and with chunksize below" - " 1 mln has the lowest memory requirements." - # " 'cython' is deprecated and provided for backwards compatibility", -) - -### Scipy and sklearn-specific options: -@click.option( - "--chunksize", - type=int, - default=100_000, - show_default=True, - help="Number of pairs in each chunk. Reduce for lower memory footprint." - " Below 10,000 performance starts suffering significantly and the algorithm might" - " miss a few duplicates with non-zero --max-mismatch." - " Only works with '--backend scipy or sklearn'", -) -@click.option( - "--carryover", - type=int, - default=100, - show_default=True, - help="Number of deduped pairs to carry over from previous chunk to the new chunk" - " to avoid breaking duplicate clusters." 
- " Only works with '--backend scipy or sklearn'", -) -@click.option( - "-p", - "--n-proc", - type=int, - default=1, - help="Number of cores to use. Only applies with sklearn backend." - "Still needs testing whether it is ever useful.", -) - -### Output options: -@click.option( - "--mark-dups", - is_flag=True, - help='If specified, duplicate pairs are marked as DD in "pair_type" and ' - "as a duplicate in the sam entries.", -) -@click.option( - "--keep-parent-id", - is_flag=True, - help="If specified, duplicate pairs are marked with the readID of the retained" - " deduped read in the 'parent_readID' field.", -) -@click.option( - "--extra-col-pair", - nargs=2, - # type=click.Tuple([str, str]), - multiple=True, - help="Extra columns that also must match for two pairs to be marked as " - "duplicates. Can be either provided as 0-based column indices or as column " - 'names (requires the "#columns" header field). The option can be provided ' - "multiple times if multiple column pairs must match. " - 'Example: --extra-col-pair "phase1" "phase2"', -) - -### Input options: -@click.option( - "--sep", - type=str, - default=_pairsam_format.PAIRSAM_SEP_ESCAPE, - help=r"Separator (\t, \v, etc. characters are " "supported, pass them in quotes) ", -) -@click.option( - "--comment-char", type=str, default="#", help="The first character of comment lines" -) -@click.option( - "--send-header-to", - type=click.Choice(["dups", "dedup", "both", "none"]), - default="both", - help="Which of the outputs should receive header and comment lines", -) -@click.option( - "--c1", - type=int, - default=_pairsam_format.COL_C1, - help=f"Chrom 1 column; default {_pairsam_format.COL_C1}" - " Only works with '--backend cython'", -) -@click.option( - "--c2", - type=int, - default=_pairsam_format.COL_C2, - help=f"Chrom 2 column; default {_pairsam_format.COL_C2}" - " Only works with '--backend cython'", -) -@click.option( - "--p1", - type=int, - default=_pairsam_format.COL_P1, - help=f"Position 1 column; default {_pairsam_format.COL_P1}" - " Only works with '--backend cython'", -) -@click.option( - "--p2", - type=int, - default=_pairsam_format.COL_P2, - help=f"Position 2 column; default {_pairsam_format.COL_P2}" - " Only works with '--backend cython'", -) -@click.option( - "--s1", - type=int, - default=_pairsam_format.COL_S1, - help=f"Strand 1 column; default {_pairsam_format.COL_S1}" - " Only works with '--backend cython'", -) -@click.option( - "--s2", - type=int, - default=_pairsam_format.COL_S2, - help=f"Strand 2 column; default {_pairsam_format.COL_S2}" - " Only works with '--backend cython'", -) -@click.option( - "--unmapped-chrom", - type=str, - default=_pairsam_format.UNMAPPED_CHROM, - help="Placeholder for a chromosome on an unmapped side; default {}".format( - _pairsam_format.UNMAPPED_CHROM - ), -) -@common_io_options -def dedup( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, -): - """Find and remove PCR/optical duplicates. - - Find PCR duplicates in an upper-triangular flipped sorted pairs/pairsam - file. Allow for a +/-N bp mismatch at each side of duplicated molecules. - - PAIRS_PATH : input triu-flipped sorted .pairs or .pairsam file. If the - path ends with .gz/.lz4, the input is decompressed by bgzip/lz4c. 
- By default, the input is read from stdin. - """ - - dedup_py( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, - ) - - -if __name__ == "__main__": - dedup() - - -def dedup_py( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, -): - - sep = ast.literal_eval('"""' + sep + '"""') - send_header_to_dedup = send_header_to in ["both", "dedup"] - send_header_to_dup = send_header_to in ["both", "dups"] - - instream = ( - _fileio.auto_open( - pairs_path, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - if pairs_path - else sys.stdin - ) - outstream = ( - _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output - else sys.stdout - ) - out_stats_stream = ( - _fileio.auto_open( - output_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output_stats - else None - ) - - out_bytile_dups_stats_stream = ( - _fileio.auto_open( - output_bytile_dups_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output_bytile_dups_stats - else None - ) - - # generate empty PairCounter if stats output is requested: - out_stat = PairCounter() if output_stats else None - - if not output_dups: - outstream_dups = None - elif output_dups == "-" or ( - pathlib.Path(output_dups).absolute() == pathlib.Path(output).absolute() - ): - outstream_dups = outstream - else: - outstream_dups = _fileio.auto_open( - output_dups, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - if not output_unmapped: - outstream_unmapped = None - elif output_unmapped == "-" or ( - pathlib.Path(output_unmapped).absolute() == pathlib.Path(output).absolute() - ): - outstream_unmapped = outstream - elif ( - pathlib.Path(output_unmapped).absolute() == pathlib.Path(output_dups).absolute() - ): - outstream_unmapped = outstream_dups - else: - outstream_unmapped = _fileio.auto_open( - output_unmapped, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - header, body_stream = _headerops.get_header(instream) - header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME) - if send_header_to_dedup: - outstream.writelines((l + "\n" for l in header)) - if send_header_to_dup and outstream_dups and (outstream_dups != outstream): - dups_header = header - if keep_parent_id and len(dups_header) > 0: - dups_header = _headerops.append_columns(dups_header, ["parent_readID"]) - outstream_dups.writelines((l + "\n" for l in dups_header)) - if ( - outstream_unmapped - and (outstream_unmapped != outstream) - and (outstream_unmapped != outstream_dups) - ): - outstream_unmapped.writelines((l + "\n" for l in header)) - - column_names = _headerops.extract_column_names(header) - extra_cols1 = [] - extra_cols2 = [] - if extra_col_pair is not None: - for col1, col2 in extra_col_pair: - extra_cols1.append(column_names[col1] if col1.isdigit() 
else col1) - extra_cols2.append(column_names[col2] if col2.isdigit() else col2) - - if backend == "cython": - # warnings.warn( - # "'cython' backend is deprecated and provided only" - # " for backwards compatibility", - # DeprecationWarning, - # ) - extra_cols1 = [column_names.index(col) for col in extra_cols1] - extra_cols2 = [column_names.index(col) for col in extra_cols2] - streaming_dedup_cython( - method, - max_mismatch, - sep, - c1, - c2, - p1, - p2, - s1, - s2, - extra_cols1, - extra_cols2, - unmapped_chrom, - body_stream, - outstream, - outstream_dups, - outstream_unmapped, - out_stat, - mark_dups, - keep_parent_id, - ) - elif backend in ("scipy", "sklearn"): - streaming_dedup( - in_stream=instream, - colnames=column_names, - chunksize=chunksize, - carryover=carryover, - method=method, - mark_dups=mark_dups, - max_mismatch=max_mismatch, - extra_col_pairs=list(extra_col_pair), - keep_parent_id=keep_parent_id, - unmapped_chrom=unmapped_chrom, - comment_char=comment_char, - outstream=outstream, - outstream_dups=outstream_dups, - outstream_unmapped=outstream_unmapped, - out_stat=out_stat, - backend=backend, - n_proc=n_proc, - ) - else: - raise ValueError("Unknown backend") - - # save statistics to a file if it was requested: - if out_stat: - out_stat.save(out_stats_stream) - - if output_bytile_dups_stats: - out_stat.save_bytile_dups(out_bytile_dups_stats_stream) - - if instream != sys.stdin: - instream.close() - - if outstream != sys.stdout: - outstream.close() - - if outstream_dups and (outstream_dups != outstream): - outstream_dups.close() - - if ( - outstream_unmapped - and (outstream_unmapped != outstream) - and (outstream_unmapped != outstream_dups) - ): - outstream_unmapped.close() - - if out_stats_stream: - out_stats_stream.close() - - -### Chunked dedup ### -def streaming_dedup( - in_stream, - colnames, - chunksize, - carryover, - method, - mark_dups, - max_mismatch, - extra_col_pairs, - unmapped_chrom, - comment_char, - outstream, - outstream_dups, - outstream_unmapped, - keep_parent_id, - out_stat, - backend, - n_proc, -): - - deduped_chunks = _dedup_stream( - in_stream=in_stream, - colnames=colnames, - method=method, - chunksize=chunksize, - carryover=carryover, - mark_dups=mark_dups, - max_mismatch=max_mismatch, - extra_col_pairs=extra_col_pairs, - keep_parent_id=keep_parent_id, - comment_char=comment_char, - backend=backend, - n_proc=n_proc, - ) - - for df_chunk in deduped_chunks: - # Write the stats if requested: - if out_stat is not None: - out_stat.add_pairs_from_dataframe( - df_chunk, - unmapped_chrom=unmapped_chrom, - analyse_bytile_dups=keep_parent_id, - ) - - # Define masks of unmapped and duplicated reads: - mask_mapped = np.logical_and( - (df_chunk["chrom1"] != unmapped_chrom), - (df_chunk["chrom2"] != unmapped_chrom), - ) - mask_duplicates = df_chunk["duplicate"] - - # Clean up dataframe: - df_chunk = df_chunk.drop(columns=["duplicate"]) - - # Stream the dups: - if outstream_dups: - df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv( - outstream_dups, index=False, header=False, sep="\t" - ) - - # Drop readID if it was created (not needed for nodup and unmapped pairs): - if keep_parent_id: - df_chunk = df_chunk.drop(columns=["parent_readID"]) - - # Stream unmapped: - if outstream_unmapped: - df_chunk.loc[~mask_mapped, :].to_csv( - outstream_unmapped, index=False, header=False, sep="\t" - ) - - # Stream unique pairs: - df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv( - outstream, index=False, header=False, sep="\t" - ) - - -def _dedup_stream( - 
in_stream, - colnames, - method, - chunksize, - carryover, - mark_dups, - max_mismatch, - extra_col_pairs, - keep_parent_id, - comment_char, - backend, - n_proc, -): - # Stream the input dataframe: - dfs = pd.read_table( - in_stream, comment=comment_char, names=colnames, chunksize=chunksize - ) - - # Set up the carryover dataframe: - df_prev_nodups = pd.DataFrame([]) - prev_i = 0 - - # Iterate over chunks: - for df in dfs: - df_marked = _dedup_chunk( - pd.concat([df_prev_nodups, df], axis=0, ignore_index=True).reset_index( - drop=True - ), - r=max_mismatch, - method=method, - keep_parent_id=keep_parent_id, - extra_col_pairs=extra_col_pairs, - backend=backend, - n_proc=n_proc, - ) - df_marked = df_marked.loc[prev_i:, :].reset_index(drop=True) - mask_duplicated = df_marked["duplicate"] - if mark_dups: - df_marked.loc[mask_duplicated, "pair_type"] = "DD" - - # Filter out duplicates and store specific columns: - df_nodups = df_marked.loc[~mask_duplicated, colnames] - - # Re-define carryover pairs: - df_prev_nodups = df_nodups.tail(carryover).reset_index(drop=True) - prev_i = len(df_prev_nodups) - - yield df_marked - - -def _dedup_chunk( - df, - r, - method, - keep_parent_id, - extra_col_pairs, - backend, - unmapped_chrom="!", - n_proc=1, -): - """Mark duplicates in a dataframe of pairs - - Parameters - ---------- - df : pd.DataFrame - Dataframe with pairs, has to contain columns 'chrom1', 'pos1', 'chrom2', 'pos2' - 'strand1', 'strand2' - r : int - Allowed distance between two pairs to call them duplicates - method : str - 'sum' or 'max' - whether 'r' uses sum of distances on two ends of pairs, or the - maximal distance - keep_parent_id : bool - If True, the read ID of the read that was not labelled as a duplicate from a - group of duplicates is recorded for each read marked as duplicate. - Only possible with non-cython backends - extra_col_pairs : list of tuples - List of extra column pairs that need to match between two reads for them be - considered duplicates (e.g. useful if alleles are annotated) - backend : str - 'scipy', 'sklearn', 'cython' - unmapped_chrom : str, optional - Which character denotes unmapped reads in the chrom1/chrom2 fields, - by default "!" 
- n_proc : int, optional - How many cores to use, by default 1 - Only works for 'sklearn' backend - - Returns - ------- - pd.DataFrame - Dataframe with marked duplicates (extra boolean field 'duplicate'), and - optionally recorded 'parent_readID' - - """ - if method not in ("max", "sum"): - raise ValueError('Unknown method, only "sum" or "max" allowed') - if backend == "sklearn": - from sklearn import neighbors - - if method == "sum": - p = 1 - else: - p = np.inf - - # Store the index of the dataframe: - index_colname = df.index.name - if index_colname is None: - index_colname = "index" - df = df.reset_index() # Remove the index temporarily - - # Set up columns to store the dedup info: - df.loc[:, "clusterid"] = np.nan - df.loc[:, "duplicate"] = False - - # Split mapped and unmapped reads: - mask_unmapped = (df["chrom1"] == unmapped_chrom) | (df["chrom2"] == unmapped_chrom) - df_unmapped = df.loc[mask_unmapped, :].copy() - df_mapped = df.loc[~mask_unmapped, :].copy() - N_mapped = df_mapped.shape[0] - - # If there are some mapped reads, dedup them: - if N_mapped > 0: - if backend == "sklearn": - a = neighbors.radius_neighbors_graph( - df_mapped[["pos1", "pos2"]], radius=r, p=p, n_jobs=n_proc, - ) - a0, a1 = a.nonzero() - elif backend == "scipy": - z = scipy.spatial.cKDTree(df_mapped[["pos1", "pos2"]]) - a = z.query_pairs(r=r, p=p, output_type="ndarray") - a0 = a[:, 0] - a1 = a[:, 1] - need_to_match = np.array( - [ - ("chrom1", "chrom1"), - ("chrom2", "chrom2"), - ("strand1", "strand1"), - ("strand2", "strand2"), - ] - + extra_col_pairs - ) - nonpos_matches = np.all( - [ - df_mapped.iloc[a0, df_mapped.columns.get_loc(lc)].values - == df_mapped.iloc[a1, df_mapped.columns.get_loc(rc)].values - for (lc, rc) in need_to_match - ], - axis=0, - ) - a0 = a0[nonpos_matches] - a1 = a1[nonpos_matches] - a_mat = coo_matrix((np.ones_like(a0), (a0, a1)), shape=(N_mapped, N_mapped)) - - # Set up inferred clusterIDs: - df_mapped.loc[:, "clusterid"] = connected_components(a_mat, directed=False)[1] - - mask_dups = df_mapped["clusterid"].duplicated() - df_mapped.loc[mask_dups, "duplicate"] = True - - # Mark parent IDs if requested: - if keep_parent_id: - df_mapped.loc[:, "parent_readID"] = df_mapped["clusterid"].map( - df_mapped[~mask_dups].set_index("clusterid")["readID"] - ) - df_unmapped.loc[:, "parent_readID"] = "" - - # Reconstruct original dataframe with removed duplicated reads: - df = pd.concat([df_unmapped, df_mapped]).reset_index(drop=True) - df = df.set_index(index_colname) # Set up the original index - df = df.drop( - ["clusterid"], axis=1 - ) # Remove the information that we don't need anymore: - - return df - - -def streaming_dedup_cython( - method, - max_mismatch, - sep, - c1ind, - c2ind, - p1ind, - p2ind, - s1ind, - s2ind, - extra_cols1, - extra_cols2, - unmapped_chrom, - instream, - outstream, - outstream_dups, - outstream_unmapped, - out_stat, - mark_dups, - keep_parent_id=False, - readid_ind=0, -): - """ - Cython-powered deduplication with online algorithm based on indexed list. 
- - Parameters - ---------- - method: "max" or "sum" - max_mismatch: maximum allowed mismatch to count the pairs as duplicates - sep: separator of the fields in the input file - c1ind: index of the chr1 column - c2ind: index of the chr2 column - p1ind: index of the pos1 column - p2ind: index of the pos2 column - s1ind: index of the strand1 column - s2ind: index of the strand2 column - extra_cols1: extra columns for left alignment in a pair to add - extra_cols2: extra columns for right alignment in a pair to add - unmapped_chrom: Symbol of the chromosome for the unmapped alignment - instream: input stream of pair file - outstream: output stram of deduplicated pairs - outstream_dups: output stream of duplicates (optionally with added parent_id, see keep_parent_id option) - outstream_unmapped: output stram of unmapped pairs - out_stat: output statistics - mark_dups: if True, will add "DD" as the pair_type - keep_parent_id: if True, additional column "parent_readID will be added to the output, can be useful for optical duplicates search - readid_ind: index of the readID column in the input file - - Returns - ------- - - """ - maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind) - if bool(extra_cols1) and bool(extra_cols2): - maxind = max(maxind, max(extra_cols1), max(extra_cols2)) - - all_scols1 = [s1ind] + extra_cols1 - all_scols2 = [s2ind] + extra_cols2 - - # if we do stats in the dedup, we need PAIR_TYPE - # i do not see way around this: - if out_stat: - ptind = _pairsam_format.COL_PTYPE - maxind = max(maxind, ptind) - - dd = _dedup.OnlineDuplicateDetector( - method, max_mismatch, returnData=False, keep_parent_id=keep_parent_id - ) - - c1 = [] - c2 = [] - p1 = [] - p2 = [] - s1 = [] - s2 = [] - idx = [] - line_buffer = [] - cols_buffer = [] - chromDict = {} - strandDict = {} - curMaxLen = max(MAX_LEN, dd.getLen()) - - instream = iter(instream) - read_idx = 0 # read index to mark the parent readID - while True: - rawline = next(instream, None) - stripline = rawline.strip() if rawline else None - - # take care of empty lines not at the end of the file separately - if rawline and (not stripline): - warnings.warn("Empty line detected not at the end of the file") - continue - - if stripline: - cols = stripline.split(sep) - if len(cols) <= maxind: - raise ValueError( - "Error parsing line {}: ".format(stripline) - + " expected {} columns, got {}".format(maxind, len(cols)) - ) - - if (cols[c1ind] == unmapped_chrom) or (cols[c2ind] == unmapped_chrom): - - if outstream_unmapped: - outstream_unmapped.write(stripline) - # don't forget terminal newline - outstream_unmapped.write("\n") - - # add a pair to PairCounter if stats output is requested: - if out_stat: - out_stat.add_pair( - cols[c1ind], - int(cols[p1ind]), - cols[s1ind], - cols[c2ind], - int(cols[p2ind]), - cols[s2ind], - cols[ptind], - ) - else: - line_buffer.append(stripline) - cols_buffer.append(cols) - - c1.append(fetchadd(cols[c1ind], chromDict)) - c2.append(fetchadd(cols[c2ind], chromDict)) - p1.append(int(cols[p1ind])) - p2.append(int(cols[p2ind])) - - idx.append(read_idx) - read_idx += 1 - - if bool(extra_cols1) and bool(extra_cols2): - s1.append( - fetchadd("".join(cols[i] for i in all_scols1), strandDict) - ) - s2.append( - fetchadd("".join(cols[i] for i in all_scols2), strandDict) - ) - else: - s1.append(fetchadd(cols[s1ind], strandDict)) - s2.append(fetchadd(cols[s2ind], strandDict)) - if (not stripline) or (len(c1) == curMaxLen): - if keep_parent_id: - res, parents = dd.push( - ar(c1, 32), - ar(c2, 32), - ar(p1, 32), - ar(p2, 
32), - ar(s1, 32), - ar(s2, 32), - ) - - else: - res = dd.push( - ar(c1, 32), - ar(c2, 32), - ar(p1, 32), - ar(p2, 32), - ar(s1, 32), - ar(s2, 32), - ) - - if not stripline: - if keep_parent_id: - res_tmp, parents_tmp = dd.finish() - parents = np.concatenate([parents, parents_tmp]) - - else: - res_tmp = dd.finish() - res = np.concatenate([res, res_tmp]) - - for i in range(len(res)): - # not duplicated pair: - if not res[i]: - outstream.write(line_buffer[i]) - # don't forget terminal newline - outstream.write("\n") - if out_stat: - out_stat.add_pair( - cols_buffer[i][c1ind], - int(cols_buffer[i][p1ind]), - cols_buffer[i][s1ind], - cols_buffer[i][c2ind], - int(cols_buffer[i][p2ind]), - cols_buffer[i][s2ind], - cols_buffer[i][ptind], - ) - # duplicated pair: - else: - if out_stat: - out_stat.add_pair( - cols_buffer[i][c1ind], - int(cols_buffer[i][p1ind]), - cols_buffer[i][s1ind], - cols_buffer[i][c2ind], - int(cols_buffer[i][p2ind]), - cols_buffer[i][s2ind], - "DD", - ) - if outstream_dups: - if mark_dups: # DD-marked pair: - output = sep.join(mark_split_pair_as_dup(cols_buffer[i])) - else: # pair as is: - output = line_buffer[i] - - if keep_parent_id: # Add parentID as the last column: - parent_readID = line_buffer[parents[i]].split(sep)[ - readid_ind - ] - output = sep.join([output, parent_readID]) - - outstream_dups.write(output) - - # don't forget terminal newline - outstream_dups.write("\n") - - # flush buffers and perform necessary checks here: - c1 = [] - c2 = [] - p1 = [] - p2 = [] - s1 = [] - s2 = [] - line_buffer = line_buffer[len(res) :] - cols_buffer = cols_buffer[len(res) :] - if not stripline: - if len(line_buffer) != 0: - raise ValueError( - "{} lines left in the buffer, ".format(len(line_buffer)) - + "should be none;" - + "something went terribly wrong" - ) - break - # process next line ... - - # all lines have been processed at this point. - # streaming_dedup is over. - - -def fetchadd(key, mydict): - key = key.strip() - if key not in mydict: - mydict[key] = len(mydict) - return mydict[key] - - -def ar(mylist, val): - return np.array(mylist, dtype={8: np.int8, 16: np.int16, 32: np.int32}[val]) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py deleted file mode 100755 index f0dd71cd..00000000 --- a/pairtools/pairtools_stats.py +++ /dev/null @@ -1,835 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import io -import sys -import click -import warnings - -import numpy as np -import pandas as pd -from scipy import special - -from collections.abc import Mapping - -from . import _fileio, _pairsam_format, cli, _headerops, common_io_options - -UTIL_NAME = "pairtools_stats" - - -@cli.command() -@click.argument("input_path", type=str, nargs=-1, required=False) -@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") -@click.option( - "--merge", - is_flag=True, - help="If specified, merge multiple input stats files instead of calculating" - " statistics of a .pairs/.pairsam file. Merging is performed via summation of" - " all overlapping statistics. Non-overlapping statistics are appended to" - " the end of the file.", -) -@click.option( - "--bytile-dups/--no-bytile-dups", - default=False, - help="If enabled, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately." 
- " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", -) -@click.option( - "--output-bytile-dups-stats", - type=str, - default="", - help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately and will save details to this path." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", -) -@common_io_options -def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): - """Calculate pairs statistics. - - INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. - If not provided, the input is read from stdin. - If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number - of stats files to merge. - - The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. - """ - stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, - ) - - -def stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs -): - if merge: - do_merge(output, input_path, **kwargs) - return - - instream = ( - _fileio.auto_open( - input_path[0], - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - if input_path - else sys.stdin - ) - outstream = ( - _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output - else sys.stdout - ) - - header, body_stream = _headerops.get_header(instream) - cols = _headerops.extract_column_names(header) - - if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: - warnings.warn( - "No 'parent_readID' column in the file, not generating duplicate stats." - ) - bytile_dups = False - output_bytile_dups_stats = False - # new stats class stuff would come here ... - stats = PairCounter() - - # Collecting statistics - for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): - stats.add_pairs_from_dataframe(chunk) - - if output_bytile_dups_stats: - stats.save_bytile_dups() - stats.calculate_summaries() - - # save statistics to file ... - stats.save(outstream) - - if instream != sys.stdin: - instream.close() - if outstream != sys.stdout: - outstream.close() - - -def do_merge(output, files_to_merge, **kwargs): - # Parse all stats files. - stats = [] - for stat_file in files_to_merge: - f = _fileio.auto_open( - stat_file, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - # use a factory method to instanciate PairCounter - stat = PairCounter.from_file(f) - stats.append(stat) - f.close() - - # combine stats from several files (files_to_merge): - out_stat = sum(stats) - - # Save merged stats. - outstream = _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - # save statistics to file ... 
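# For reference, the same merge can be driven from Python (hypothetical file
# names; sum() falls back on PairCounter.__radd__ for the initial 0):
#
#   counters = [PairCounter.from_file(open(p)) for p in ("lane1.stats", "lane2.stats")]
#   merged = sum(counters)
#   with open("merged.stats", "w") as out:
#       merged.save(out)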
- out_stat.save(outstream) - - if outstream != sys.stdout: - outstream.close() - - -def estimate_library_complexity(nseq, ndup, nopticaldup=0): - """Estimate library complexity accounting for optical/clustering duplicates - - Parameters - ---------- - nseq : int - Total number of sequences - ndup : int - Total number of duplicates - nopticaldup : int, optional - Number of non-PCR duplicates, by default 0 - - Returns - ------- - float - Estimated complexity - """ - nseq = nseq - nopticaldup - if nseq == 0: - warnings.warn("Empty of fully duplicated library, can't estimate complexity") - return 0 - ndup = ndup - nopticaldup - u = (nseq - ndup) / nseq - seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u - complexity = nseq / seq_to_complexity - return complexity - - -def extract_tile_info(series, regex=False): - """Extract the name of the tile for each read name in the series - - Parameters - ---------- - series : pd.Series - Series containing read IDs - regex : bool, optional - Regex to extract fields from the read IDs that correspond to tile IDs. - By default False, uses a faster predefined approach for typical Illumina - read names - Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" - - Returns - ------- - Series - Series containing tile IDs as strings - """ - if regex: - split = series.str.extractall(regex).unstack().droplevel(1, axis=1) - return split[0] + ":" + split[1] + ":" + split[2] - else: - split = series.str.split(":", expand=True) - return split[2] + ":" + split[3] + ":" + split[4] - - -def analyse_duplicate_stats(dups, tile_dup_regex=False): - """Count by-tile duplicates - - Parameters - ---------- - dups : pd.DataFrame - Dataframe with duplicates that contains pared read IDs - tile_dup_regex : bool, optional - See extract_tile_info for details, by default False - - Returns - ------- - pd.DataFrame - Grouped multi-indexed dataframe of pairwise by-tile duplication counts - """ - dups = dups.copy() - dups["tile"] = extract_tile_info(dups["readID"]) - dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) - dups["same_tile"] = dups["tile"] == dups["parent_tile"] - bytile_dups = ( - dups.groupby(["tile", "parent_tile"]) - .size() - .reset_index(name="dup_count") - .sort_values(["tile", "parent_tile"]) - ) - bytile_dups[["tile", "parent_tile"]] = np.sort( - bytile_dups[["tile", "parent_tile"]].values, axis=1 - ) - bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() - return bytile_dups - - -class PairCounter(Mapping): - """ - A Counter for Hi-C pairs that accumulates various statistics. - - PairCounter implements two interfaces to access multi-level statistics: - 1. as a nested dict, e.g. pairCounter['pair_types']['LL'] - 2. as a flat dict, with the level keys separated by '/', e.g. pairCounter['pair_types/LL'] - - Other features: - -- PairCounters can be saved into/loaded from a file - -- multiple PairCounters can be merged via addition. 
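    A minimal usage sketch (hypothetical coordinates; the DD pair marks a
    duplicate so that the derived complexity summaries stay well defined):

        pc = PairCounter()
        pc.add_pair("chr1", 100, "+", "chr1", 1500, "-", "UU")
        pc.add_pair("chr1", 100, "+", "chr1", 1500, "-", "DD")
        pc["pair_types/UU"]            # flat access   -> 1
        pc["pair_types"]["UU"]         # nested access -> 1
        combined = pc + PairCounter()  # merging via addition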
- """ - - _SEP = "\t" - _KEY_SEP = "/" - - def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): - self._stat = {} - # some variables used for initialization: - # genomic distance bining for the ++/--/-+/+- distribution - self._dist_bins = np.r_[ - 0, - np.round( - 10 - ** np.arange( - min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step - ) - ).astype(np.int), - ] - - # establish structure of an empty _stat: - self._stat["total"] = 0 - self._stat["total_unmapped"] = 0 - self._stat["total_single_sided_mapped"] = 0 - # total_mapped = total_dups + total_nodups - self._stat["total_mapped"] = 0 - self._stat["total_dups"] = 0 - self._stat["total_nodups"] = 0 - ######################################## - # the rest of stats are based on nodups: - ######################################## - self._stat["cis"] = 0 - self._stat["trans"] = 0 - self._stat["pair_types"] = {} - # to be removed: - self._stat["dedup"] = {} - - self._stat["cis_1kb+"] = 0 - self._stat["cis_2kb+"] = 0 - self._stat["cis_4kb+"] = 0 - self._stat["cis_10kb+"] = 0 - self._stat["cis_20kb+"] = 0 - self._stat["cis_40kb+"] = 0 - - self._stat["chrom_freq"] = {} - - self._stat["dist_freq"] = { - "+-": np.zeros(len(self._dist_bins), dtype=np.int), - "-+": np.zeros(len(self._dist_bins), dtype=np.int), - "--": np.zeros(len(self._dist_bins), dtype=np.int), - "++": np.zeros(len(self._dist_bins), dtype=np.int), - } - # Summaries are derived from other stats and are recalculated on merge - self._stat["summary"] = dict( - [ - ("frac_cis", 0), - ("frac_cis_1kb+", 0), - ("frac_cis_2kb+", 0), - ("frac_cis_4kb+", 0), - ("frac_cis_10kb+", 0), - ("frac_cis_20kb+", 0), - ("frac_cis_40kb+", 0), - ("frac_dups", 0), - ("complexity_naive", 0), - ] - ) - self.bytile_dups = pd.DataFrame( - index=pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] - ) - ) - - def __getitem__(self, key): - if isinstance(key, str): - # let's strip any unintentional '/' - # from either side of the key - key = key.strip("/") - if self._KEY_SEP in key: - # multi-key to access nested elements - k_fields = key.split(self._KEY_SEP) - else: - # single-key access flat part of PairCounter - # or to access highest level of hierarchy - return self._stat[key] - else: - # clearly an error: - raise ValueError(f"{key} is not a valid key: must be str") - - # K_FIELDS: - # process multi-key case: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] - # get the first 'k' and keep the remainders in 'k_fields' - k = k_fields.pop(0) - if k in ["pair_types", "dedup"]: - # assert there is only one element in key_fields left: - # 'pair_types' and 'dedup' treated the same - if len(k_fields) == 1: - return self._stat[k][k_fields[0]] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 1 identifier" - ) - - elif k == "chrom_freq": - # assert remaining key_fields == [chr1, chr2]: - if len(k_fields) == 2: - return self._stat[k][tuple(k_fields)] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 2 identifiers" - ) - - elif k == "dist_freq": - # assert that last element of key_fields is the 'directions' - # THIS IS DONE FOR CONSISTENCY WITH .stats FILE - # SHOULD THAT BE CHANGED IN .stats AND HERE AS WELL? 
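            # For example, a (hypothetical) flat key "dist_freq/1000-1778/+-" is
            # resolved below by taking the left edge 1000 of the bin label and
            # locating it with np.searchsorted(self._dist_bins, 1000, "right") - 1;
            # the open-ended last bin is labelled with a trailing "+" instead of a
            # right edge and is handled by the strip("+") branch.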
- if len(k_fields) == 2: - # assert 'dirs' in ['++','--','+-','-+'] - dirs = k_fields.pop() - # there is only genomic distance range of the bin that's left: - (bin_range,) = k_fields - # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = ( - bin_range.strip("+") - if bin_range.endswith("+") - else bin_range.split("-")[0] - ) - # get the index of that bin: - bin_idx = ( - np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1 - ) - # store corresponding value: - return self._stat["dist_freq"][dirs][bin_idx] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 2 identifiers" - ) - else: - raise ValueError(f"{k} is not a valid key") - - def __iter__(self): - return iter(self._stat) - - def __len__(self): - return len(self._stat) - - def calculate_summaries(self): - """calculate summary statistics (fraction of cis pairs at different cutoffs, - complexity estimate) based on accumulated counts. Results are saved into - self._stat['summary'] - - """ - for cis_count in ( - "cis", - "cis_1kb+", - "cis_2kb+", - "cis_4kb+", - "cis_10kb+", - "cis_20kb+", - "cis_40kb+", - ): - self._stat["summary"][f"frac_{cis_count}"] = ( - (self._stat[cis_count] / self._stat["total_nodups"]) - if self._stat["total_nodups"] > 0 - else 0 - ) - self._stat["summary"]["frac_dups"] = ( - (self._stat["total_dups"] / self._stat["total_mapped"]) - if self._stat["total_mapped"] > 0 - else 0 - ) - self._stat["summary"]["complexity_naive"] = estimate_library_complexity( - self._stat["total_mapped"], self._stat["total_dups"], 0 - ) - if self.bytile_dups.shape[0] > 0: - self._stat["dups_by_tile_median"] = ( - self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] - ) - if "dups_by_tile_median" in self._stat: - self._stat["summary"][ - "complexity_dups_by_tile_median" - ] = estimate_library_complexity( - self._stat["total_mapped"], - self._stat["total_dups"], - self._stat["dups_by_tile_median"], - ) - - @classmethod - def from_file(cls, file_handle): - """create instance of PairCounter from file - - Parameters - ---------- - file_handle: file handle - - Returns - ------- - PairCounter - new PairCounter filled with the contents of the input file - """ - # fill in from file - file_handle: - stat_from_file = cls() - for l in file_handle: - fields = l.strip().split(cls._SEP) - if len(fields) == 0: - # skip empty lines: - continue - if len(fields) != 2: - # expect two _SEP separated values per line: - raise _fileio.ParseError( - "{} is not a valid stats file".format(file_handle.name) - ) - # extract key and value, then split the key: - putative_key, putative_val = fields[0], fields[1] - key_fields = putative_key.split(cls._KEY_SEP) - # we should impose a rigid structure of .stats or redo it: - if len(key_fields) == 1: - key = key_fields[0] - if key in stat_from_file._stat: - stat_from_file._stat[key] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - else: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup', 'summary'] - # get the first 'key' and keep the remainders in 'key_fields' - key = key_fields.pop(0) - if key in ["pair_types", "dedup", "summary"]: - # assert there is only one element in key_fields left: - # 'pair_types' and 'dedup' treated the same - if len(key_fields) == 1: - try: - stat_from_file._stat[key][key_fields[0]] = int(fields[1]) - except ValueError: - stat_from_file._stat[key][key_fields[0]] = float(fields[1]) - else: 
- raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 1 identifier".format( - file_handle.name, key - ) - ) - - elif key == "chrom_freq": - # assert remaining key_fields == [chr1, chr2]: - if len(key_fields) == 2: - stat_from_file._stat[key][tuple(key_fields)] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) - - elif key == "dist_freq": - # assert that last element of key_fields is the 'directions' - if len(key_fields) == 2: - # assert 'dirs' in ['++','--','+-','-+'] - dirs = key_fields.pop() - # there is only genomic distance range of the bin that's left: - (bin_range,) = key_fields - # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = ( - bin_range.strip("+") - if bin_range.endswith("+") - else bin_range.split("-")[0] - ) - # get the index of that bin: - bin_idx = ( - np.searchsorted( - stat_from_file._dist_bins, int(dist_bin_left), "right" - ) - - 1 - ) - # store corresponding value: - stat_from_file._stat[key][dirs][bin_idx] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - # return PairCounter from a non-empty dict: - return stat_from_file - - def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): - """Gather statistics for a Hi-C pair and add to the PairCounter. - - Parameters - ---------- - chrom1: str - chromosome of the first read - pos1: int - position of the first read - strand1: str - strand of the first read - chrom2: str - chromosome of the first read - pos2: int - position of the first read - strand2: str - strand of the first read - pair_type: str - type of the mapped pair of reads - """ - - self._stat["total"] += 1 - # collect pair type stats including DD: - self._stat["pair_types"][pair_type] = ( - self._stat["pair_types"].get(pair_type, 0) + 1 - ) - if chrom1 == "!" and chrom2 == "!": - self._stat["total_unmapped"] += 1 - elif chrom1 != "!" and chrom2 != "!": - self._stat["total_mapped"] += 1 - # only mapped ones can be duplicates: - if pair_type == "DD": - self._stat["total_dups"] += 1 - else: - self._stat["total_nodups"] += 1 - self._stat["chrom_freq"][(chrom1, chrom2)] = ( - self._stat["chrom_freq"].get((chrom1, chrom2), 0) + 1 - ) - - if chrom1 == chrom2: - self._stat["cis"] += 1 - dist = np.abs(pos2 - pos1) - bin_idx = np.searchsorted(self._dist_bins, dist, "right") - 1 - self._stat["dist_freq"][strand1 + strand2][bin_idx] += 1 - if dist >= 1000: - self._stat["cis_1kb+"] += 1 - if dist >= 2000: - self._stat["cis_2kb+"] += 1 - if dist >= 4000: - self._stat["cis_4kb+"] += 1 - if dist >= 10000: - self._stat["cis_10kb+"] += 1 - if dist >= 20000: - self._stat["cis_20kb+"] += 1 - if dist >= 40000: - self._stat["cis_40kb+"] += 1 - - else: - self._stat["trans"] += 1 - else: - self._stat["total_single_sided_mapped"] += 1 - - def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): - """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. - - Parameters - ---------- - df: pd.DataFrame - DataFrame with pairs. 
Needs to have columns: - 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type' - """ - - total_count = df.shape[0] - self._stat["total"] += total_count - - # collect pair type stats including DD: - for pair_type, type_count in df["pair_type"].value_counts().items(): - self._stat["pair_types"][pair_type] = ( - self._stat["pair_types"].get(pair_type, 0) + type_count - ) - - # Count the unmapped by the "unmapped" chromosomes (debatable, as WW are also marked as ! and they might be mapped): - unmapped_count = np.logical_and( - df["chrom1"] == unmapped_chrom, df["chrom2"] == unmapped_chrom - ).sum() - self._stat["total_unmapped"] += int(unmapped_count) - - # Count the mapped: - df_mapped = df.loc[ - (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), : - ] - mapped_count = df_mapped.shape[0] - - self._stat["total_mapped"] += mapped_count - self._stat["total_single_sided_mapped"] += int( - total_count - (mapped_count + unmapped_count) - ) - - # Count the duplicates: - if "duplicate" in df_mapped.columns: - mask_dups = df_mapped["duplicate"] - else: - mask_dups = df_mapped["pair_type"] == "DD" - dups = df_mapped[mask_dups] - dups_count = dups.shape[0] - self._stat["total_dups"] += int(dups_count) - self._stat["total_nodups"] += int(mapped_count - dups_count) - - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() - - # Count pairs per chromosome: - for (chrom1, chrom2), chrom_count in ( - df_nodups[["chrom1", "chrom2"]].value_counts().items() - ): - self._stat["chrom_freq"][(chrom1, chrom2)] = ( - self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count - ) - - # Count cis-trans by pairs: - - self._stat["cis"] += df_cis.shape[0] - self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] - dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) - - df_cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1 - for (strand1, strand2, bin_id), strand_bin_count in ( - df_cis[["strand1", "strand2", "bin_idx"]].value_counts().items() - ): - self._stat["dist_freq"][strand1 + strand2][bin_id] += strand_bin_count - self._stat["cis_1kb+"] += int(np.sum(dist >= 1000)) - self._stat["cis_2kb+"] += int(np.sum(dist >= 2000)) - self._stat["cis_4kb+"] += int(np.sum(dist >= 4000)) - self._stat["cis_10kb+"] += int(np.sum(dist >= 10000)) - self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) - self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) - - ### Add by-tile dups - if bytile_dups and dups.shape[0] > 0: - self.bytile_dups = self.bytile_dups.add( - analyse_duplicate_stats(dups), fill_value=0 - ).astype(int) - - def __add__(self, other): - # both PairCounter are implied to have a list of common fields: - # - # 'total', 'total_unmapped', 'total_single_sided_mapped', 'total_mapped', - # 'cis', 'trans', 'pair_types', 'cis_1kb+', 'cis_2kb+', - # 'cis_10kb+', 'cis_20kb+', 'chrom_freq', 'dist_freq', 'dedup' - # - # initialize empty PairCounter for the result of summation: - sum_stat = PairCounter() - # use the empty PairCounter to iterate over: - for k, v in sum_stat._stat.items(): - # not nested fields are summed trivially: - if isinstance(v, int): - sum_stat._stat[k] = self._stat[k] + other._stat[k] - # sum nested dicts/arrays in a context dependet manner: - else: - if k in ["pair_types", "dedup"]: - # handy function for summation of a pair of dicts: - # https://stackoverflow.com/questions/10461531/merge-and-sum-of-two-dictionaries - sum_dicts = lambda dict_x, 
dict_y: { - key: dict_x.get(key, 0) + dict_y.get(key, 0) - for key in set(dict_x) | set(dict_y) - } - # sum a pair of corresponding dicts: - sum_stat._stat[k] = sum_dicts(self._stat[k], other._stat[k]) - if k == "chrom_freq": - # union list of keys (chr1,chr2) with potential duplicates: - union_keys_with_dups = list(self._stat[k].keys()) + list( - other._stat[k].keys() - ) - # dict.fromkeys will take care of keys' order and duplicates in a consistent manner: - # https://stackoverflow.com/questions/1720421/how-to-concatenate-two-lists-in-python - # last comment to the 3rd Answer - sum_stat._stat[k] = dict.fromkeys(union_keys_with_dups) - # perform a summation: - for union_key in sum_stat._stat[k]: - sum_stat._stat[k][union_key] = self._stat[k].get( - union_key, 0 - ) + other._stat[k].get(union_key, 0) - if k == "dist_freq": - for dirs in sum_stat[k]: - sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] - sum_stat.calculate_summaries() - sum_stat["summary"]["complexity_naive"] = ( - self._stat["summary"]["complexity_naive"] - + other._stat["summary"]["complexity_naive"] - ) - if ( - "complexity_dups_by_tile_median" in self._stat - and "complexity_dups_by_tile_median" in other._stat - ): - sum_stat["summary"]["complexity_dups_by_tile_median"] = ( - self._stat["summary"]["complexity_dups_by_tile_median"] - + other._stat["summary"]["complexity_dups_by_tile_median"] - ) - # self.bytile_dups.add(other.bytile_dups, fill_value=0).astype(int) - return sum_stat - - # we need this to be able to sum(list_of_PairCounters) - def __radd__(self, other): - if other == 0: - return self - else: - return self.__add__(other) - - def flatten(self): - - """return a flattened dict (formatted same way as .stats file) - - """ - # dict for flat store: - flat_stat = {} - - # Storing statistics - for k, v in self._stat.items(): - if isinstance(v, int): - flat_stat[k] = v - # store nested dicts/arrays in a context dependet manner: - # nested categories are stored only if they are non-trivial - else: - if (k == "dist_freq") and v: - for i in range(len(self._dist_bins)): - for dirs, freqs in v.items(): - # last bin is treated differently: "100000+" vs "1200-3000": - if i != len(self._dist_bins) - 1: - formatted_key = self._KEY_SEP.join( - ["{}", "{}-{}", "{}"] - ).format( - k, self._dist_bins[i], self._dist_bins[i + 1], dirs - ) - else: - formatted_key = self._KEY_SEP.join( - ["{}", "{}+", "{}"] - ).format(k, self._dist_bins[i], dirs) - # store key,value pair: - flat_stat[formatted_key] = freqs[i] - elif (k in ["pair_types", "dedup", "summary"]) and v: - # 'pair_types', 'dedup' and 'summary' are simple dicts inside, - - # treat them the exact same way: - for k_item, freq in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( - k, k_item - ) - # store key,value pair: - flat_stat[formatted_key] = freq - elif (k == "chrom_freq") and v: - for (chrom1, chrom2), freq in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( - k, chrom1, chrom2 - ) - # store key,value pair: - flat_stat[formatted_key] = freq - - # return flattened dict - return flat_stat - - def save(self, outstream): - """save PairCounter to tab-delimited text file. - Flattened version of PairCounter is stored in the file. - - Parameters - ---------- - outstream: file handle - - - Note - ---- - The order of the keys is not guaranteed - Merging several .stats is not associative with respect to key order: - merge(A,merge(B,C)) != merge(merge(A,B),C). 
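        For instance, a chrom_freq key present in only one of the inputs can end
        up at a different position in the output depending on which two files are
        merged first, even though the same key/value pairs are written either way.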
- - Theys shou5ld match exactly, however, when soprted: - sort(merge(A,merge(B,C))) == sort(merge(merge(A,B),C)) - """ - - # write flattened version of the PairCounter to outstream - for k, v in self.flatten().items(): - outstream.write("{}{}{}\n".format(k, self._SEP, v)) - - def save_bytile_dups(self, outstream): - """save bytile duplication counts to a tab-delimited text file. - - Parameters - ---------- - outstream: file handle - """ - self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) - - -if __name__ == "__main__": - stats() From 034b8d0d8c71be4ced4ea8f9c8e60f1f61c7d336 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:17:56 +0200 Subject: [PATCH 12/20] Test always --- .github/workflows/python-package.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 54fffe4b..da3f924f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -4,10 +4,8 @@ name: Python package on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + push + pull_request jobs: build: From bc647ba43b31c76dc337b136a07babf830db49e4 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:22:07 +0200 Subject: [PATCH 13/20] fix testing --- .github/workflows/python-package.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index da3f924f..92d47fde 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,9 +3,7 @@ name: Python package -on: - push - pull_request +on: [push, pull_request] jobs: build: From c356d7432bad3d04b3433b036419ccc440e5ac65 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:22:58 +0200 Subject: [PATCH 14/20] Avoid double testing in PRs --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 92d47fde..cb1e04f9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,7 +3,7 @@ name: Python package -on: [push, pull_request] +on: push jobs: build: From 1654e8c2066c732a37e89dc690701d3817420c9d Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:28:56 +0200 Subject: [PATCH 15/20] Fix stats --- pairtools/lib/stats.py | 120 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 2b9ab373..5e0e128f 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -37,6 +37,86 @@ def do_merge(output, files_to_merge, **kwargs): outstream.close() +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + if nseq == 0: + warnings.warn("Empty of fully duplicated library, can't estimate complexity") + return 0 + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = nseq / seq_to_complexity + return complexity + + +def extract_tile_info(series, 
regex=False): + """Extract the name of the tile for each read name in the series + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. + By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + return split[2] + ":" + split[3] + ":" + split[4] + + +def analyse_duplicate_stats(dups, tile_dup_regex=False): + """Count by-tile duplicates + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + dups = dups.copy() + dups["tile"] = extract_tile_info(dups["readID"]) + dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) + dups["same_tile"] = dups["tile"] == dups["parent_tile"] + bytile_dups = ( + dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups + + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -180,6 +260,46 @@ def __iter__(self): def __len__(self): return len(self._stat) + def calculate_summaries(self): + """calculate summary statistics (fraction of cis pairs at different cutoffs, + complexity estimate) based on accumulated counts. 
Results are saved into + self._stat['summary'] + """ + for cis_count in ( + "cis", + "cis_1kb+", + "cis_2kb+", + "cis_4kb+", + "cis_10kb+", + "cis_20kb+", + "cis_40kb+", + ): + self._stat["summary"][f"frac_{cis_count}"] = ( + (self._stat[cis_count] / self._stat["total_nodups"]) + if self._stat["total_nodups"] > 0 + else 0 + ) + self._stat["summary"]["frac_dups"] = ( + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 + ) + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( + self._stat["total_mapped"], self._stat["total_dups"], 0 + ) + if self.bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] + ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file From 7cd0c649061fdf028303672111ebbaf4d1a932a4 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:33:09 +0200 Subject: [PATCH 16/20] Add missing pieces in stats --- pairtools/lib/stats.py | 52 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 5e0e128f..aed95fd5 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from collections.abc import Mapping import sys from . import fileio @@ -180,6 +181,26 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) "++": np.zeros(len(self._dist_bins), dtype=np.int), } + # Summaries are derived from other stats and are recalculated on merge + self._stat["summary"] = dict( + [ + ("frac_cis", 0), + ("frac_cis_1kb+", 0), + ("frac_cis_2kb+", 0), + ("frac_cis_4kb+", 0), + ("frac_cis_10kb+", 0), + ("frac_cis_20kb+", 0), + ("frac_cis_40kb+", 0), + ("frac_dups", 0), + ("complexity_naive", 0), + ] + ) + self.bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) + ) + def __getitem__(self, key): if isinstance(key, str): # let's strip any unintentional '/' @@ -465,9 +486,9 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. 
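        For example, a minimal chunk with the expected columns (hypothetical
        values; pd is pandas, imported at the top of this module):

            df = pd.DataFrame({
                "chrom1": ["chr1", "chr1"], "pos1": [100, 100],
                "chrom2": ["chr1", "chr2"], "pos2": [5000, 700],
                "strand1": ["+", "+"], "strand2": ["-", "+"],
                "pair_type": ["UU", "UU"],
            })
            PairCounter().add_pairs_from_dataframe(df)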
- + Parameters ---------- df: pd.DataFrame @@ -506,22 +527,25 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups_count = mask_dups.sum() + dups = df_mapped[mask_dups] + dups_count = dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) + df_nodups = df_mapped.loc[~mask_dups, :] + mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] + df_cis = df_nodups.loc[mask_cis, :].copy() + # Count pairs per chromosome: for (chrom1, chrom2), chrom_count in ( - df_mapped[["chrom1", "chrom2"]].value_counts().items() + df_nodups[["chrom1", "chrom2"]].value_counts().items() ): self._stat["chrom_freq"][(chrom1, chrom2)] = ( self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count ) # Count cis-trans by pairs: - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() + self._stat["cis"] += df_cis.shape[0] self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) @@ -538,6 +562,12 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) + ### Add by-tile dups + if bytile_dups and dups.shape[0] > 0: + self.bytile_dups = self.bytile_dups.add( + analyse_duplicate_stats(dups), fill_value=0 + ).astype(int) + def add_chromsizes(self, chromsizes): """ Add chromsizes field to the output stats @@ -728,3 +758,11 @@ def save(self, outstream, yaml=False): else: for k, v in self.flatten().items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + + def save_bytile_dups(self, outstream): + """save bytile duplication counts to a tab-delimited text file. + Parameters + ---------- + outstream: file handle + """ + self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) From 09cfab0ba0805a96dce31f6d4f09085a2c3d98b2 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:34:46 +0200 Subject: [PATCH 17/20] Fix missing imports --- pairtools/lib/stats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index aed95fd5..97badf09 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,5 +1,8 @@ +import warnings + import numpy as np import pandas as pd +from scipy import special from collections.abc import Mapping import sys from . import fileio From 6154e286648f271e2e928cecadb463a2ae6287cd Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 16:47:51 +0200 Subject: [PATCH 18/20] FIx order in stats test --- pairtools/lib/stats.py | 13 ++++++++----- tests/data/mock.4stats.pairs | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 97badf09..653d2673 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -289,6 +289,13 @@ def calculate_summaries(self): complexity estimate) based on accumulated counts. 
Results are saved into self._stat['summary'] """ + + self._stat["summary"]["frac_dups"] = ( + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 + ) + for cis_count in ( "cis", "cis_1kb+", @@ -303,11 +310,7 @@ def calculate_summaries(self): if self._stat["total_nodups"] > 0 else 0 ) - self._stat["summary"]["frac_dups"] = ( - (self._stat["total_dups"] / self._stat["total_mapped"]) - if self._stat["total_mapped"] > 0 - else 0 - ) + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) diff --git a/tests/data/mock.4stats.pairs b/tests/data/mock.4stats.pairs index 4d6bd7fa..73c46673 100644 --- a/tests/data/mock.4stats.pairs +++ b/tests/data/mock.4stats.pairs @@ -10,11 +10,11 @@ #chromsize: chr3 100 #chromsize: chr1 100 #columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type -readid01 chr1 1 chr2 20 + + UU -readid02 chr1 1 chr1 50 + + UU -readid03 chr1 1 chr1 50 + + DD -readid04 chr1 1 chr1 2 + + UU -readid05 chr1 1 chr1 3 + + UR +readid01 chr1 1 chr1 50 + + UU +readid02 chr1 1 chr1 50 + + DD +readid03 chr1 1 chr1 2 + + UU +readid04 chr1 1 chr1 3 + + UR +readid05 chr1 1 chr2 20 + + UU readid06 chr2 1 chr3 2 + + UU readid07 ! 0 chr1 3 - + NU readid08 ! 0 chr1 3 - + MU From 7face1d3be43ea9b7a26588e4a86c20bab79340e Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Wed, 27 Apr 2022 14:09:40 -0400 Subject: [PATCH 19/20] important fixes; tests are back. --- pairtools/cli/dedup.py | 42 +++--- pairtools/cli/stats.py | 43 +++--- pairtools/lib/dedup.py | 3 +- pairtools/lib/stats.py | 311 +++++++++++++++++++++++------------------ 4 files changed, 224 insertions(+), 175 deletions(-) diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index 61d68572..74bfe29a 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -5,8 +5,8 @@ import ast import pathlib -# from distutils.log import warn -# import warnings +from .._logging import get_logger +logger = get_logger() from ..lib import fileio, pairsam_format, headerops from . import cli, common_io_options @@ -61,13 +61,14 @@ " By default, statistics are not printed.", ) @click.option( - "--output-bytile-dups-stats", + "--output-bytile-stats", type=str, default="", help="output file for duplicate statistics." " If file exists, it will be open in the append mode." " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, by-tile duplicate statistics are not printed.", + " By default, by-tile duplicate statistics are not printed." + " Note that the readID should be provided and contain tile information for this option. 
", ) ### Set the dedup method: @click.option( @@ -231,7 +232,7 @@ def dedup( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -269,7 +270,7 @@ def dedup( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -303,7 +304,7 @@ def dedup_py( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -361,18 +362,21 @@ def dedup_py( else None ) - out_bytile_dups_stats_stream = ( - fileio.auto_open( - output_bytile_dups_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), + bytile_dups = False + if output_bytile_stats: + out_bytile_stats_stream = fileio.auto_open( + output_bytile_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), ) - if output_bytile_dups_stats - else None - ) + bytile_dups = True + if not keep_parent_id: + logger.warning("Force output --parent-readID because --output-bytile-stats provided.") + keep_parent_id = True + # generate empty PairCounter if stats output is requested: - out_stat = PairCounter() if output_stats else None + out_stat = PairCounter(bytile_dups=bytile_dups) if output_stats else None if not output_dups: outstream_dups = None @@ -486,8 +490,8 @@ def dedup_py( if out_stat: out_stat.save(out_stats_stream) - if output_bytile_dups_stats: - out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + if bytile_dups: + out_stat.save_bytile_dups(out_bytile_stats_stream) if instream != sys.stdin: instream.close() diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index a8888687..ff7b5904 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -3,7 +3,6 @@ import io import sys import click -import warnings import pandas as pd from ..lib import fileio, pairsam_format, headerops @@ -11,6 +10,9 @@ from ..lib.stats import PairCounter, do_merge +from .._logging import get_logger +logger = get_logger() + UTIL_NAME = "pairtools_stats" @@ -45,18 +47,21 @@ default=False, help="If enabled, will analyse by-tile duplication statistics to estimate" " library complexity more accurately." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)" + " Saves by-tile stats into --output_bytile-stats stream, or regular output if --output_bytile-stats is not provided.", ) @click.option( - "--output-bytile-dups-stats", - type=str, + "--output-bytile-stats", default="", - help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately and will save details to this path." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", + required=False, + help="output file for tile duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed." + " Note that the readID and parent_readID should be provided and contain tile information for this option.", ) @common_io_options -def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): +def stats(input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs): """Calculate pairs statistics. 
INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -68,12 +73,12 @@ def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kw """ stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs, ) def stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs ): if merge: do_merge(output, input_path, **kwargs) @@ -94,18 +99,23 @@ def stats_py( nproc=kwargs.get("nproc_out"), command=kwargs.get("cmd_out", None), ) + if bytile_dups and not output_bytile_stats: + output_bytile_stats = outstream + if output_bytile_stats: + bytile_dups = True header, body_stream = headerops.get_header(instream) cols = headerops.extract_column_names(header) - if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: - warnings.warn( + # Check necessary columns for reporting bytile stats: + if bytile_dups and "parent_readID" not in cols: + logger.warning( "No 'parent_readID' column in the file, not generating duplicate stats." ) bytile_dups = False - output_bytile_dups_stats = False + # new stats class stuff would come here ... - stats = PairCounter() + stats = PairCounter(bytile_dups=bytile_dups) # Collecting statistics for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): @@ -115,8 +125,9 @@ def stats_py( chromsizes = headerops.extract_chromsizes(header) stats.add_chromsizes(chromsizes) - if output_bytile_dups_stats: - stats.save_bytile_dups() + if bytile_dups: + stats.save_bytile_dups(output_bytile_stats) + stats.calculate_summaries() # save statistics to file ... diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py index 29cccbbd..a7aac0a9 100644 --- a/pairtools/lib/dedup.py +++ b/pairtools/lib/dedup.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import warnings import scipy.spatial from scipy.sparse import coo_matrix @@ -380,7 +379,7 @@ def streaming_dedup_cython( # take care of empty lines not at the end of the file separately if rawline and (not stripline): - warnings.warn("Empty line detected not at the end of the file") + logger.warning("Empty line detected not at the end of the file") continue if stripline: diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 653d2673..9e822f43 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pandas as pd from scipy import special @@ -7,118 +5,8 @@ import sys from . import fileio - -def do_merge(output, files_to_merge, **kwargs): - # Parse all stats files. - stats = [] - for stat_file in files_to_merge: - f = fileio.auto_open( - stat_file, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - # use a factory method to instanciate PairCounter - stat = PairCounter.from_file(f) - stats.append(stat) - f.close() - - # combine stats from several files (files_to_merge): - out_stat = sum(stats) - - # Save merged stats. - outstream = fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - # save statistics to file ... 
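# Note on the by-tile statistics wired up above (hypothetical read name): the
# regex-free path of extract_tile_info() keys duplicates by the flowcell, lane
# and tile fields (the third to fifth ":"-separated fields) of an Illumina
# read name, e.g.
#
#   A00123:55:HVK7YDSXX:2:11101:1234:5678  ->  tile "HVK7YDSXX:2:11101"
#
# so parent_readID must carry the original read names; SRA-style readIDs with
# these fields stripped cannot be used for by-tile duplicate statistics.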
- out_stat.save(outstream) - - if outstream != sys.stdout: - outstream.close() - - -def estimate_library_complexity(nseq, ndup, nopticaldup=0): - """Estimate library complexity accounting for optical/clustering duplicates - Parameters - ---------- - nseq : int - Total number of sequences - ndup : int - Total number of duplicates - nopticaldup : int, optional - Number of non-PCR duplicates, by default 0 - Returns - ------- - float - Estimated complexity - """ - nseq = nseq - nopticaldup - if nseq == 0: - warnings.warn("Empty of fully duplicated library, can't estimate complexity") - return 0 - ndup = ndup - nopticaldup - u = (nseq - ndup) / nseq - seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u - complexity = nseq / seq_to_complexity - return complexity - - -def extract_tile_info(series, regex=False): - """Extract the name of the tile for each read name in the series - Parameters - ---------- - series : pd.Series - Series containing read IDs - regex : bool, optional - Regex to extract fields from the read IDs that correspond to tile IDs. - By default False, uses a faster predefined approach for typical Illumina - read names - Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" - Returns - ------- - Series - Series containing tile IDs as strings - """ - if regex: - split = series.str.extractall(regex).unstack().droplevel(1, axis=1) - return split[0] + ":" + split[1] + ":" + split[2] - else: - split = series.str.split(":", expand=True) - return split[2] + ":" + split[3] + ":" + split[4] - - -def analyse_duplicate_stats(dups, tile_dup_regex=False): - """Count by-tile duplicates - Parameters - ---------- - dups : pd.DataFrame - Dataframe with duplicates that contains pared read IDs - tile_dup_regex : bool, optional - See extract_tile_info for details, by default False - Returns - ------- - pd.DataFrame - Grouped multi-indexed dataframe of pairwise by-tile duplication counts - """ - dups = dups.copy() - dups["tile"] = extract_tile_info(dups["readID"]) - dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) - dups["same_tile"] = dups["tile"] == dups["parent_tile"] - bytile_dups = ( - dups.groupby(["tile", "parent_tile"]) - .size() - .reset_index(name="dup_count") - .sort_values(["tile", "parent_tile"]) - ) - bytile_dups[["tile", "parent_tile"]] = np.sort( - bytile_dups[["tile", "parent_tile"]].values, axis=1 - ) - bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() - return bytile_dups +from .._logging import get_logger +logger = get_logger() class PairCounter(Mapping): @@ -137,7 +25,7 @@ class PairCounter(Mapping): _SEP = "\t" _KEY_SEP = "/" - def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): + def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25, bytile_dups=False): self._stat = {} # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution @@ -198,11 +86,13 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("complexity_naive", 0), ] ) - self.bytile_dups = pd.DataFrame( - index=pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + self._save_bytile_dups = bytile_dups + if self._save_bytile_dups: + self._bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) ) - ) def __getitem__(self, key): if isinstance(key, str): @@ -314,18 +204,21 @@ def calculate_summaries(self): self._stat["summary"]["complexity_naive"] = 
estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) - if self.bytile_dups.shape[0] > 0: - self._stat["dups_by_tile_median"] = ( - self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] - ) - if "dups_by_tile_median" in self._stat: - self._stat["summary"][ - "complexity_dups_by_tile_median" - ] = estimate_library_complexity( - self._stat["total_mapped"], - self._stat["total_dups"], - self._stat["dups_by_tile_median"], - ) + + if self._save_bytile_dups: + # Estimate library complexity with information by tile, if provided: + if self._bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] + ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) @classmethod def from_file(cls, file_handle): @@ -492,7 +385,7 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -533,8 +426,8 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups = df_mapped[mask_dups] - dups_count = dups.shape[0] + df_dups = df_mapped[mask_dups] + dups_count = df_dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) @@ -569,9 +462,10 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) ### Add by-tile dups - if bytile_dups and dups.shape[0] > 0: - self.bytile_dups = self.bytile_dups.add( - analyse_duplicate_stats(dups), fill_value=0 + if self._save_bytile_dups and (df_dups.shape[0] > 0): + bytile_dups = analyse_bytile_duplicate_stats(df_dups) + self._bytile_dups = self._bytile_dups.add( + bytile_dups, fill_value=0 ).astype(int) def add_chromsizes(self, chromsizes): @@ -681,6 +575,13 @@ def flatten(self): ) # store key,value pair: flat_stat[formatted_key] = freq + elif (k == "summary") and v: + for key, frac in v.items(): + formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( + k, key + ) + # store key,value pair: + flat_stat[formatted_key] = frac # return flattened dict return flat_stat @@ -732,6 +633,11 @@ def format(self): ] = freq # store key,value pair: formatted_stat[k] = deepcopy(freqs) + elif (k == "summary") and v: + summary_stats = {} + for key, frac in v.items(): + summary_stats[key] = frac + formatted_stat[k] = deepcopy(summary_stats) # return formatted dict return formatted_stat @@ -771,4 +677,133 @@ def save_bytile_dups(self, outstream): ---------- outstream: file handle """ - self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + if self._save_bytile_dups: + self._bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + else: + logger.error("Bytile dups are not calculated, cannot save.") + + +################## +# Other functions: + +def do_merge(output, files_to_merge, **kwargs): + # Parse all stats files. 
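    # Each input is the flat two-column layout written by save(): nested keys
    # joined with "/", one key/value pair per line, e.g. (hypothetical values):
    #
    #   total_nodups            850000
    #   pair_types/UU           840000
    #   summary/frac_dups       0.15
    #   chrom_freq/chr1/chr1    120000
    #   dist_freq/1000-1778/+-  3500
    #
    # PairCounter.from_file() below reverses this flattening.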
+ stats = [] + for stat_file in files_to_merge: + f = fileio.auto_open( + stat_file, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + # use a factory method to instanciate PairCounter + stat = PairCounter.from_file(f) + stats.append(stat) + f.close() + + # combine stats from several files (files_to_merge): + out_stat = sum(stats) + + # Save merged stats. + outstream = fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + # save statistics to file ... + out_stat.save(outstream) + + if outstream != sys.stdout: + outstream.close() + + +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + if nseq == 0: + logger.warning("Empty of fully duplicated library, can't estimate complexity") + return 0 + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + if u==0: + logger.warning("All the sequences are duplicates. Do you run complexity estimation on duplicates file?") + return 0 + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = float(nseq / seq_to_complexity) # clean np.int64 data type + return complexity + + +def analyse_bytile_duplicate_stats(df_dups, tile_dup_regex=False): + """Count by-tile duplicates + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + + df_dups = df_dups.copy() + + df_dups["tile"] = extract_tile_info(df_dups["readID"], regex=tile_dup_regex) + df_dups["parent_tile"] = extract_tile_info(df_dups["parent_readID"], regex=tile_dup_regex) + + df_dups["same_tile"] = (df_dups["tile"] == df_dups["parent_tile"]) + bytile_dups = ( + df_dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups + + +def extract_tile_info(series, regex=False): + """Extract the name of the tile for each read name in the series + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. 
+ By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + if split.shape[1]<4: + raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}") + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + if split.shape[1]<5: + raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}") + return split[2] + ":" + split[3] + ":" + split[4] From c762fd5359453820a4fb89854e4d09c4731fc8eb Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Wed, 27 Apr 2022 14:43:45 -0400 Subject: [PATCH 20/20] compute_summaries is now part of saving the stats, if not done explicitly --- pairtools/cli/stats.py | 4 +--- pairtools/lib/stats.py | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index ff7b5904..890988b0 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -107,7 +107,7 @@ def stats_py( header, body_stream = headerops.get_header(instream) cols = headerops.extract_column_names(header) - # Check necessary columns for reporting bytile stats: + # Check necessary columns for reporting by-tile stats: if bytile_dups and "parent_readID" not in cols: logger.warning( "No 'parent_readID' column in the file, not generating duplicate stats." @@ -128,8 +128,6 @@ def stats_py( if bytile_dups: stats.save_bytile_dups(output_bytile_stats) - stats.calculate_summaries() - # save statistics to file ... stats.save(outstream, yaml=kwargs.get("yaml", False)) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 9e822f43..167ce491 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -93,6 +93,7 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25, levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] ) ) + self._summaries_calculated = False def __getitem__(self, key): if isinstance(key, str): @@ -211,7 +212,7 @@ def calculate_summaries(self): self._stat["dups_by_tile_median"] = ( self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] ) - if "dups_by_tile_median" in self._stat: + if "dups_by_tile_median" in self._stat.keys(): self._stat["summary"][ "complexity_dups_by_tile_median" ] = estimate_library_complexity( @@ -220,6 +221,8 @@ def calculate_summaries(self): self._stat["dups_by_tile_median"], ) + self._summaries_calculated = True + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file @@ -661,6 +664,9 @@ def save(self, outstream, yaml=False): sort(merge(A,merge(B,C))) == sort(merge(merge(A,B),C)) """ + if not self._summaries_calculated: + self.calculate_summaries() + # write flattened version of the PairCounter to outstream if yaml: import yaml
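For reference, a minimal end-to-end sketch of the reworked API after this series
(hypothetical file names; assumes a .pairs file whose header lists its columns
and, for the by-tile part, readID and parent_readID columns as written by
pairtools dedup --output-bytile-stats):

import sys
import pandas as pd

from pairtools.lib import fileio, headerops
from pairtools.lib.stats import PairCounter

instream = fileio.auto_open("sample.dedup.pairs.gz", mode="r", nproc=1)
header, body = headerops.get_header(instream)
cols = headerops.extract_column_names(header)

# Count by-tile duplicates only if dedup kept the parent read IDs.
counter = PairCounter(bytile_dups="parent_readID" in cols)
for chunk in pd.read_table(body, names=cols, chunksize=100_000):
    counter.add_pairs_from_dataframe(chunk)

counter.save(sys.stdout)  # summaries are now filled in by save() itself
if "parent_readID" in cols:
    with open("sample.bytile.tsv", "w") as out:
        counter.save_bytile_dups(out)
instream.close()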