From 538cd0c685dbe6fcbb85bb7bbf27ac854d16df80 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 12:49:02 +0100 Subject: [PATCH 01/20] Add summaries, also some f-strings and black --- pairtools/pairtools_stats.py | 446 ++++++++++++++++++++--------------- 1 file changed, 262 insertions(+), 184 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 0b56a380..2064d71a 100644 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- import io @@ -11,63 +10,62 @@ from . import _fileio, _pairsam_format, cli, _headerops, common_io_options -UTIL_NAME = 'pairtools_stats' - -@cli.command() - -@click.argument( - 'input_path', - type=str, - nargs=-1, - required=False) +UTIL_NAME = "pairtools_stats" -@click.option( - '-o', "--output", - type=str, - default="", - help='output stats tsv file.') +@cli.command() +@click.argument("input_path", type=str, nargs=-1, required=False) +@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") @click.option( - "--merge", + "--merge", is_flag=True, - help='If specified, merge multiple input stats files instead of calculating' - ' statistics of a .pairs/.pairsam file. Merging is performed via summation of' - ' all overlapping statistics. Non-overlapping statistics are appended to' - ' the end of the file.', - ) - + help="If specified, merge multiple input stats files instead of calculating" + " statistics of a .pairs/.pairsam file. Merging is performed via summation of" + " all overlapping statistics. Non-overlapping statistics are appended to" + " the end of the file.", +) @common_io_options - def stats(input_path, output, merge, **kwargs): - '''Calculate pairs statistics. + """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. If not provided, the input is read from stdin. - If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number + If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number of stats files to merge. - - The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. - ''' + + The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. + """ stats_py(input_path, output, merge, **kwargs) + def stats_py(input_path, output, merge, **kwargs): if merge: do_merge(output, input_path, **kwargs) return - instream = (_fileio.auto_open(input_path[0], mode='r', - nproc=kwargs.get('nproc_in'), - command=kwargs.get('cmd_in', None)) - if input_path else sys.stdin) - outstream = (_fileio.auto_open(output, mode='w', - nproc=kwargs.get('nproc_out'), - command=kwargs.get('cmd_out', None)) - if output else sys.stdout) - + instream = ( + _fileio.auto_open( + input_path[0], + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if input_path + else sys.stdin + ) + outstream = ( + _fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) header, body_stream = _headerops.get_header(instream) - # new stats class stuff would come here ... 
stats = PairCounter() @@ -86,28 +84,28 @@ def stats_py(input_path, output, merge, **kwargs): # pair type: pair_type = cols[_pairsam_format.COL_PTYPE] - stats.add_pair(chrom1, pos1, strand1, - chrom2, pos2, strand2, - pair_type) - + stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type) + stats.calculate_summaries() # save statistics to file ... stats.save(outstream) - - if instream != sys.stdin: instream.close() if outstream != sys.stdout: outstream.close() + def do_merge(output, files_to_merge, **kwargs): # Parse all stats files. stats = [] for stat_file in files_to_merge: - f = _fileio.auto_open(stat_file, mode='r', - nproc=kwargs.get('nproc_in'), - command=kwargs.get('cmd_in', None)) + f = _fileio.auto_open( + stat_file, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) # use a factory method to instanciate PairCounter stat = PairCounter.from_file(f) stats.append(stat) @@ -117,17 +115,27 @@ def do_merge(output, files_to_merge, **kwargs): out_stat = sum(stats) # Save merged stats. - outstream = _fileio.auto_open(output, mode='w', - nproc=kwargs.get('nproc_out'), - command=kwargs.get('cmd_out', None)) + outstream = _fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) # save statistics to file ... out_stat.save(outstream) - + if outstream != sys.stdout: outstream.close() +def complexity_estimate(total_mapped, total_dups): + try: + return total_mapped ** 2 / total_dups / 2 + except ZeroDivisionError: + return 0 + + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -141,58 +149,76 @@ class PairCounter(Mapping): -- multiple PairCounters can be merged via addition. """ - _SEP = '\t' - _KEY_SEP = '/' + _SEP = "\t" + _KEY_SEP = "/" def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): self._stat = OrderedDict() # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution - self._dist_bins = (np.r_[0, - np.round(10**np.arange(min_log10_dist, max_log10_dist+0.001, - log10_dist_bin_step)) - .astype(np.int)] - ) + self._dist_bins = np.r_[ + 0, + np.round( + 10 + ** np.arange( + min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step + ) + ).astype(np.int), + ] # establish structure of an empty _stat: - self._stat['total'] = 0 - self._stat['total_unmapped'] = 0 - self._stat['total_single_sided_mapped'] = 0 + self._stat["total"] = 0 + self._stat["total_unmapped"] = 0 + self._stat["total_single_sided_mapped"] = 0 # total_mapped = total_dups + total_nodups - self._stat['total_mapped'] = 0 - self._stat['total_dups'] = 0 - self._stat['total_nodups'] = 0 + self._stat["total_mapped"] = 0 + self._stat["total_dups"] = 0 + self._stat["total_nodups"] = 0 ######################################## # the rest of stats are based on nodups: ######################################## - self._stat['cis'] = 0 - self._stat['trans'] = 0 - self._stat['pair_types'] = {} + self._stat["cis"] = 0 + self._stat["trans"] = 0 + self._stat["pair_types"] = {} # to be removed: - self._stat['dedup'] = {} - - self._stat['cis_1kb+'] = 0 - self._stat['cis_2kb+'] = 0 - self._stat['cis_4kb+'] = 0 - self._stat['cis_10kb+'] = 0 - self._stat['cis_20kb+'] = 0 - self._stat['cis_40kb+'] = 0 - - self._stat['chrom_freq'] = OrderedDict() - - self._stat['dist_freq'] = OrderedDict([ - ('+-', np.zeros(len(self._dist_bins), dtype=np.int)), - ('-+', np.zeros(len(self._dist_bins), dtype=np.int)), - ('--', 
np.zeros(len(self._dist_bins), dtype=np.int)), - ('++', np.zeros(len(self._dist_bins), dtype=np.int)), - ]) - + self._stat["dedup"] = {} + + self._stat["cis_1kb+"] = 0 + self._stat["cis_2kb+"] = 0 + self._stat["cis_4kb+"] = 0 + self._stat["cis_10kb+"] = 0 + self._stat["cis_20kb+"] = 0 + self._stat["cis_40kb+"] = 0 + + self._stat["chrom_freq"] = OrderedDict() + + self._stat["dist_freq"] = OrderedDict( + [ + ("+-", np.zeros(len(self._dist_bins), dtype=np.int)), + ("-+", np.zeros(len(self._dist_bins), dtype=np.int)), + ("--", np.zeros(len(self._dist_bins), dtype=np.int)), + ("++", np.zeros(len(self._dist_bins), dtype=np.int)), + ] + ) + # Summaries are derived from other stats and are recalculated on merge + self._stat["summary"] = OrderedDict( + [ + ("frac_cis", 0), + ("frac_cis_1kb+", 0), + ("frac_cis_2kb+", 0), + ("frac_cis_4kb+", 0), + ("frac_cis_10kb+", 0), + ("frac_cis_20kb+", 0), + ("frac_cis_40kb+", 0), + ("complexity", 0), + ] + ) def __getitem__(self, key): if isinstance(key, str): # let's strip any unintentional '/' # from either side of the key - key = key.strip('/') + key = key.strip("/") if self._KEY_SEP in key: # multi-key to access nested elements k_fields = key.split(self._KEY_SEP) @@ -202,32 +228,33 @@ def __getitem__(self, key): return self._stat[key] else: # clearly an error: - raise ValueError( - '{} is not a valid key: must be str'.format(key)) + raise ValueError(f"{key} is not a valid key: must be str") # K_FIELDS: # process multi-key case: # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] # get the first 'k' and keep the remainders in 'k_fields' k = k_fields.pop(0) - if k in ['pair_types', 'dedup']: + if k in ["pair_types", "dedup"]: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(k_fields) == 1: return self._stat[k][k_fields[0]] else: raise ValueError( - '{} is not a valid key: {} section implies 1 identifier'.format(key, k)) + f"{key} is not a valid key: {k} section implies 1 identifier" + ) - elif k == 'chrom_freq': + elif k == "chrom_freq": # assert remaining key_fields == [chr1, chr2]: if len(k_fields) == 2: return self._stat[k][tuple(k_fields)] else: raise ValueError( - '{} is not a valid key: {} section implies 2 identifiers'.format(key, k)) + f"{key} is not a valid key: {k} section implies 2 identifiers" + ) - elif k == 'dist_freq': + elif k == "dist_freq": # assert that last element of key_fields is the 'directions' # THIS IS DONE FOR CONSISTENCY WITH .stats FILE # SHOULD THAT BE CHANGED IN .stats AND HERE AS WELL? 
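As a point of reference, here is a minimal, self-contained sketch of how the flat dist_freq keys map onto the log-spaced distance bins built in __init__ above; the same searchsorted lookup is what __getitem__ (continued in the next hunk) and from_file perform. The key below is illustrative, and plain int replaces np.int, which recent NumPy releases no longer accept.

import numpy as np

# Bin edges: 0, then 10**x rounded to an integer, for x running from
# min_log10_dist to max_log10_dist in steps of log10_dist_bin_step
# (defaults 0, 9 and 0.25, as in PairCounter.__init__).
min_log10_dist, max_log10_dist, step = 0, 9, 0.25
dist_bins = np.r_[
    0,
    np.round(
        10 ** np.arange(min_log10_dist, max_log10_dist + 0.001, step)
    ).astype(int),
]

# A key such as "dist_freq/1778-3162/++" names a bin by its left edge;
# searchsorted(..., "right") - 1 recovers the bin index.
key = "dist_freq/1778-3162/++"
_, bin_range, dirs = key.split("/")
left = int(bin_range.rstrip("+").split("-")[0])
bin_idx = np.searchsorted(dist_bins, left, "right") - 1
print(dirs, bin_idx, dist_bins[bin_idx], dist_bins[bin_idx + 1])  # ++ 14 1778 3162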
@@ -235,29 +262,53 @@ def __getitem__(self, key): # assert 'dirs' in ['++','--','+-','-+'] dirs = k_fields.pop() # there is only genomic distance range of the bin that's left: - bin_range, = k_fields + (bin_range,) = k_fields # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = bin_range.strip('+') if bin_range.endswith('+') \ - else bin_range.split('-')[0] + dist_bin_left = ( + bin_range.strip("+") + if bin_range.endswith("+") + else bin_range.split("-")[0] + ) # get the index of that bin: - bin_idx = np.searchsorted(self._dist_bins, int(dist_bin_left), 'right') - 1 + bin_idx = ( + np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1 + ) # store corresponding value: - return self._stat['dist_freq'][dirs][bin_idx] + return self._stat["dist_freq"][dirs][bin_idx] else: raise ValueError( - '{} is not a valid key: {} section implies 2 identifiers'.format(key,k)) + f"{key} is not a valid key: {k} section implies 2 identifiers" + ) else: - raise ValueError( - '{} is not a valid key'.format(k)) - + raise ValueError(f"{k} is not a valid key") def __iter__(self): return iter(self._stat) - def __len__(self): return len(self._stat) + def calculate_summaries(self): + """calculate summary statistics (fraction of cis pairs at different cutoffs, + complexity estimate) based on accumulated counts. Results are saved into + self._stat['summary'] + + """ + for cis_count in ( + "cis", + "cis_1kb+", + "cis_2kb+", + "cis_4kb+", + "cis_10kb+", + "cis_20kb+", + "cis_40kb+", + ): + self._stat["summary"][f"frac_{cis_count}"] = ( + self._stat[cis_count] / self._stat["total_nodups"] + ) + self._stat["summary"]["complexity"] = complexity_estimate( + self._stat["total_mapped"], self._stat["total_dups"] + ) @classmethod def from_file(cls, file_handle): @@ -275,73 +326,93 @@ def from_file(cls, file_handle): # fill in from file - file_handle: stat_from_file = cls() for l in file_handle: - fields = l.strip().split(cls._SEP) + fields = l.strip().split(cls._SEP) if len(fields) == 0: # skip empty lines: continue if len(fields) != 2: # expect two _SEP separated values per line: raise _fileio.ParseError( - '{} is not a valid stats file'.format(file_handle.name)) + "{} is not a valid stats file".format(file_handle.name) + ) # extract key and value, then split the key: - putative_key, putative_val = fields[0], fields[1] + putative_key, putative_val = fields[0], fields[1] key_fields = putative_key.split(cls._KEY_SEP) # we should impose a rigid structure of .stats or redo it: - if len(key_fields)==1: + if len(key_fields) == 1: key = key_fields[0] if key in stat_from_file._stat: stat_from_file._stat[key] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: unknown field {} detected'.format(file_handle.name,key)) + "{} is not a valid stats file: unknown field {} detected".format( + file_handle.name, key + ) + ) else: # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] # get the first 'key' and keep the remainders in 'key_fields' key = key_fields.pop(0) - if key in ['pair_types', 'dedup']: + if key in ["pair_types", "dedup"]: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(key_fields) == 1: stat_from_file._stat[key][key_fields[0]] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 1 identifier'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 1 identifier".format( + file_handle.name, key + ) + ) - elif 
key == 'chrom_freq': + elif key == "chrom_freq": # assert remaining key_fields == [chr1, chr2]: if len(key_fields) == 2: stat_from_file._stat[key][tuple(key_fields)] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 2 identifiers'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 2 identifiers".format( + file_handle.name, key + ) + ) - elif key == 'dist_freq': + elif key == "dist_freq": # assert that last element of key_fields is the 'directions' if len(key_fields) == 2: # assert 'dirs' in ['++','--','+-','-+'] dirs = key_fields.pop() # there is only genomic distance range of the bin that's left: - bin_range, = key_fields + (bin_range,) = key_fields # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = (bin_range.strip('+') - if bin_range.endswith('+') - else bin_range.split('-')[0]) + dist_bin_left = ( + bin_range.strip("+") + if bin_range.endswith("+") + else bin_range.split("-")[0] + ) # get the index of that bin: - bin_idx = np.searchsorted( - stat_from_file._dist_bins, - int(dist_bin_left), 'right') - 1 + bin_idx = ( + np.searchsorted( + stat_from_file._dist_bins, int(dist_bin_left), "right" + ) + - 1 + ) # store corresponding value: stat_from_file._stat[key][dirs][bin_idx] = int(fields[1]) else: raise _fileio.ParseError( - '{} is not a valid stats file: {} section implies 2 identifiers'.format(file_handle.name,key)) + "{} is not a valid stats file: {} section implies 2 identifiers".format( + file_handle.name, key + ) + ) else: raise _fileio.ParseError( - '{} is not a valid stats file: unknown field {} detected'.format(file_handle.name,key)) + "{} is not a valid stats file: unknown field {} detected".format( + file_handle.name, key + ) + ) # return PairCounter from a non-empty dict: return stat_from_file - def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): """Gather statistics for a Hi-C pair and add to the PairCounter. @@ -363,44 +434,46 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): type of the mapped pair of reads """ - self._stat['total'] += 1 + self._stat["total"] += 1 # collect pair type stats including DD: - self._stat['pair_types'][pair_type] = self._stat['pair_types'].get(pair_type,0) + 1 - if chrom1 == '!' and chrom2 == '!': - self._stat['total_unmapped'] += 1 - elif chrom1 != '!' and chrom2 != '!': - self._stat['total_mapped'] += 1 + self._stat["pair_types"][pair_type] = ( + self._stat["pair_types"].get(pair_type, 0) + 1 + ) + if chrom1 == "!" and chrom2 == "!": + self._stat["total_unmapped"] += 1 + elif chrom1 != "!" 
and chrom2 != "!": + self._stat["total_mapped"] += 1 # only mapped ones can be duplicates: - if pair_type == 'DD': - self._stat['total_dups'] += 1 + if pair_type == "DD": + self._stat["total_dups"] += 1 else: - self._stat['total_nodups'] += 1 - self._stat['chrom_freq'][(chrom1, chrom2)] = ( - self._stat['chrom_freq'].get((chrom1, chrom2), 0) + 1) + self._stat["total_nodups"] += 1 + self._stat["chrom_freq"][(chrom1, chrom2)] = ( + self._stat["chrom_freq"].get((chrom1, chrom2), 0) + 1 + ) if chrom1 == chrom2: - self._stat['cis'] += 1 - dist = np.abs(pos2-pos1) - bin_idx = np.searchsorted(self._dist_bins, dist, 'right') - 1 - self._stat['dist_freq'][strand1+strand2][bin_idx] += 1 + self._stat["cis"] += 1 + dist = np.abs(pos2 - pos1) + bin_idx = np.searchsorted(self._dist_bins, dist, "right") - 1 + self._stat["dist_freq"][strand1 + strand2][bin_idx] += 1 if dist >= 1000: - self._stat['cis_1kb+'] += 1 + self._stat["cis_1kb+"] += 1 if dist >= 2000: - self._stat['cis_2kb+'] += 1 + self._stat["cis_2kb+"] += 1 if dist >= 4000: - self._stat['cis_4kb+'] += 1 + self._stat["cis_4kb+"] += 1 if dist >= 10000: - self._stat['cis_10kb+'] += 1 + self._stat["cis_10kb+"] += 1 if dist >= 20000: - self._stat['cis_20kb+'] += 1 + self._stat["cis_20kb+"] += 1 if dist >= 40000: - self._stat['cis_40kb+'] += 1 + self._stat["cis_40kb+"] += 1 else: - self._stat['trans'] += 1 + self._stat["trans"] += 1 else: - self._stat['total_single_sided_mapped'] += 1 - + self._stat["total_single_sided_mapped"] += 1 def __add__(self, other): # both PairCounter are implied to have a list of common fields: @@ -412,39 +485,41 @@ def __add__(self, other): # initialize empty PairCounter for the result of summation: sum_stat = PairCounter() # use the empty PairCounter to iterate over: - for k,v in sum_stat._stat.items(): + for k, v in sum_stat._stat.items(): # not nested fields are summed trivially: if isinstance(v, int): sum_stat._stat[k] = self._stat[k] + other._stat[k] # sum nested dicts/arrays in a context dependet manner: else: - if k in ['pair_types','dedup']: + if k in ["pair_types", "dedup"]: # handy function for summation of a pair of dicts: # https://stackoverflow.com/questions/10461531/merge-and-sum-of-two-dictionaries - sum_dicts = lambda dict_x,dict_y: { - key: dict_x.get(key, 0) + dict_y.get(key, 0) - for key in set(dict_x) | set(dict_y) } + sum_dicts = lambda dict_x, dict_y: { + key: dict_x.get(key, 0) + dict_y.get(key, 0) + for key in set(dict_x) | set(dict_y) + } # sum a pair of corresponding dicts: sum_stat._stat[k] = sum_dicts(self._stat[k], other._stat[k]) - if k == 'chrom_freq': + if k == "chrom_freq": # union list of keys (chr1,chr2) with potential duplicates: - union_keys_with_dups = list(self._stat[k].keys()) + list(other._stat[k].keys()) + union_keys_with_dups = list(self._stat[k].keys()) + list( + other._stat[k].keys() + ) # OrderedDict.fromkeys will take care of keys' order and duplicates in a consistent manner: # https://stackoverflow.com/questions/1720421/how-to-concatenate-two-lists-in-python # last comment to the 3rd Answer - sum_stat._stat[k] = OrderedDict.fromkeys( union_keys_with_dups ) + sum_stat._stat[k] = OrderedDict.fromkeys(union_keys_with_dups) # perform a summation: for union_key in sum_stat._stat[k]: - sum_stat._stat[k][union_key] = ( - self._stat[k].get(union_key, 0) - + other._stat[k].get(union_key, 0) - ) - if k == 'dist_freq': + sum_stat._stat[k][union_key] = self._stat[k].get( + union_key, 0 + ) + other._stat[k].get(union_key, 0) + if k == "dist_freq": for dirs in sum_stat[k]: sum_stat[k][dirs] 
= self._stat[k][dirs] + other._stat[k][dirs] + self.calculate_summaries() return sum_stat - # we need this to be able to sum(list_of_PairCounters) def __radd__(self, other): if other == 0: @@ -452,50 +527,54 @@ def __radd__(self, other): else: return self.__add__(other) - def flatten(self): - """return a flattened OrderedDic (formatted same way as .stats file) - - """ + """return a flattened OrderedDic (formatted same way as .stats file)""" # OrderedDict for flat store: flat_stat = OrderedDict() # Storing statistics - for k,v in self._stat.items(): + for k, v in self._stat.items(): if isinstance(v, int): flat_stat[k] = v # store nested dicts/arrays in a context dependet manner: # nested categories are stored only if they are non-trivial else: - if (k == 'dist_freq') and v: + if (k == "dist_freq") and v: for i in range(len(self._dist_bins)): for dirs, freqs in v.items(): # last bin is treated differently: "100000+" vs "1200-3000": if i != len(self._dist_bins) - 1: - formatted_key = self._KEY_SEP.join(['{}','{}-{}','{}']).format( - k, self._dist_bins[i], self._dist_bins[i+1], dirs) + formatted_key = self._KEY_SEP.join( + ["{}", "{}-{}", "{}"] + ).format( + k, self._dist_bins[i], self._dist_bins[i + 1], dirs + ) else: - formatted_key = self._KEY_SEP.join(['{}','{}+','{}']).format( - k, self._dist_bins[i], dirs) - #store key,value pair: + formatted_key = self._KEY_SEP.join( + ["{}", "{}+", "{}"] + ).format(k, self._dist_bins[i], dirs) + # store key,value pair: flat_stat[formatted_key] = freqs[i] - elif (k in ['pair_types','dedup']) and v: - # 'pair_types' and 'dedup' are simple dicts inside, + elif (k in ["pair_types", "dedup", "summary"]) and v: + # 'pair_types', 'dedup' and 'summary' are simple dicts inside, # treat them the exact same way: for k_item, freq in v.items(): - formatted_key = self._KEY_SEP.join(['{}','{}']).format(k, k_item) - #store key,value pair: + formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( + k, k_item + ) + # store key,value pair: flat_stat[formatted_key] = freq - elif (k == 'chrom_freq') and v: + elif (k == "chrom_freq") and v: for (chrom1, chrom2), freq in v.items(): - formatted_key = self._KEY_SEP.join(['{}','{}','{}']).format(k, chrom1, chrom2) - #store key,value pair: + formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( + k, chrom1, chrom2 + ) + # store key,value pair: flat_stat[formatted_key] = freq # return flattened OrderedDict return flat_stat - def save(self, outstream): """save PairCounter to tab-delimited text file. Flattened version of PairCounter is stored in the file. 
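For orientation, the flattened output produced by flatten() and written by save() below is a two-column, tab-separated text file with '/'-joined keys. An illustrative excerpt (values correspond to the small mock pairs file used by the tests later in this series):

total	9
total_mapped	6
total_dups	1
total_nodups	5
cis	3
trans	2
pair_types/UU	4
chrom_freq/chr1/chr1	3
dist_freq/1-2/++	1
summary/frac_cis	0.6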
@@ -516,10 +595,9 @@ def save(self, outstream): """ # write flattened version of the PairCounter to outstream - for k,v in self.flatten().items(): - outstream.write('{}{}{}\n'.format(k, self._SEP, v)) + for k, v in self.flatten().items(): + outstream.write("{}{}{}\n".format(k, self._SEP, v)) -if __name__ == '__main__': +if __name__ == "__main__": stats() - From 99678bb0b35f770e4b064a0344a27b0b7867e9a9 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 13:09:01 +0100 Subject: [PATCH 02/20] Fix stats test --- tests/test_stats.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index a9a10674..b6691369 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -27,7 +27,10 @@ def test_mock_pairsam(): if not l.startswith('#') and l.strip()) for k in stats: - stats[k] = int(stats[k]) + try: + stats[k] = int(stats[k]) + except ValueError: + stats[k] = float(stats[k]) print(stats) assert stats['total'] == 8 @@ -53,4 +56,11 @@ def test_mock_pairsam(): assert stats['dist_freq/1-2/++'] == 1 assert stats['dist_freq/2-3/++'] == 1 assert stats['dist_freq/32-56/++'] == 1 + assert stats['summary/frac_cis'] == 0.6 + assert stats['summary/frac_cis_1kb+'] == 0 + assert stats['summary/frac_cis_2kb+'] == 0 + assert stats['summary/frac_cis_4kb+'] == 0 + assert stats['summary/frac_cis_10kb+'] == 0 + assert stats['summary/frac_cis_20kb+'] == 0 + assert stats['summary/frac_cis_40kb+'] == 0 From 6e202cb4b2a137ee3aae676fd232d7f24fbfc1f2 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 15 Jun 2021 13:16:58 +0100 Subject: [PATCH 03/20] Fix merge --- pairtools/pairtools_stats.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 2064d71a..fe0d130e 100644 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -350,14 +350,17 @@ def from_file(cls, file_handle): ) ) else: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] + # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup', 'summary'] # get the first 'key' and keep the remainders in 'key_fields' key = key_fields.pop(0) - if key in ["pair_types", "dedup"]: + if key in ["pair_types", "dedup", 'summary']: # assert there is only one element in key_fields left: # 'pair_types' and 'dedup' treated the same if len(key_fields) == 1: - stat_from_file._stat[key][key_fields[0]] = int(fields[1]) + try: + stat_from_file._stat[key][key_fields[0]] = int(fields[1]) + except ValueError: + stat_from_file._stat[key][key_fields[0]] = float(fields[1]) else: raise _fileio.ParseError( "{} is not a valid stats file: {} section implies 1 identifier".format( @@ -517,7 +520,7 @@ def __add__(self, other): if k == "dist_freq": for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] - self.calculate_summaries() + sum_stat.calculate_summaries() return sum_stat # we need this to be able to sum(list_of_PairCounters) From 96c2e48da7a4cac29da9ecc68f4e7f2493525b17 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 22 Apr 2022 15:46:50 +0200 Subject: [PATCH 04/20] Add functions for duplication tile and complexity --- pairtools/pairtools_stats.py | 95 +++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 350aa878..7fdc2e7a 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -6,6 +6,7 @@ import numpy 
as np import pandas as pd +from scipy import special from collections.abc import Mapping @@ -131,11 +132,87 @@ def do_merge(output, files_to_merge, **kwargs): outstream.close() -def complexity_estimate(total_mapped, total_dups): - try: - return total_mapped ** 2 / total_dups / 2 - except ZeroDivisionError: - return 0 +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = nseq / seq_to_complexity + return complexity + + +def extract_tile_info(series, regex=False): + """Extract the name of the tile for each read name in the series + + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. + By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + return split[2] + ":" + split[3] + ":" + split[4] + + +def analyse_duplicate_stats(dups, tile_dup_regex=False): + """_summary_ + + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + dups = dups.copy() + dups["tile"] = extract_tile_info(dups["readID"]) + dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) + dups["same_tile"] = dups["tile"] == dups["parent_tile"] + bytile_dups = ( + dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups class PairCounter(Mapping): @@ -210,7 +287,7 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("frac_cis_10kb+", 0), ("frac_cis_20kb+", 0), ("frac_cis_40kb+", 0), - ("complexity", 0), + ("naive_complexity", 0), ] ) @@ -306,7 +383,7 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) - self._stat["summary"]["complexity"] = complexity_estimate( + self._stat["summary"]["naive_complexity"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"] ) @@ -594,6 +671,10 @@ def __add__(self, other): for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] sum_stat.calculate_summaries() + sum_stat["summary"]["naive_complexity"] = ( + self._stat["summary"]["naive_complexity"] + + other._stat["summary"]["naive_complexity"] + ) return sum_stat # we need this to be able to 
sum(list_of_PairCounters) From c05e438354426ed927cc3b9874f2839fab1bc5a1 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 22 Apr 2022 16:16:38 +0200 Subject: [PATCH 05/20] Towards duplicate statistics --- pairtools/pairtools_stats.py | 63 ++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 7fdc2e7a..060d81dd 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -3,6 +3,7 @@ import io import sys import click +import warnings import numpy as np import pandas as pd @@ -26,8 +27,15 @@ " all overlapping statistics. Non-overlapping statistics are appended to" " the end of the file.", ) +@click.option( + "--by-tile-dups", + is_flag=True, + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately. Requires parent_readID column to be saved" + " by dedup (will ignore this option otherwise)", +) @common_io_options -def stats(input_path, output, merge, **kwargs): +def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -37,10 +45,10 @@ def stats(input_path, output, merge, **kwargs): The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. """ - stats_py(input_path, output, merge, **kwargs) + stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs) -def stats_py(input_path, output, merge, **kwargs): +def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): if merge: do_merge(output, input_path, **kwargs) return @@ -69,25 +77,24 @@ def stats_py(input_path, output, merge, **kwargs): header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) + if analyse_by_tile_duplication and "parent_readID" not in cols: + warnings.warn( + "No 'parent_readID' column in the file, not generating duplicate" " stats." + ) + analyse_by_tile_duplication = False + elif analyse_by_tile_duplication: + bytile_dups = pd.DataFrame([]) # new stats class stuff would come here ... stats = PairCounter() # Collecting statistics - for line in body_stream: - cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP) - - # algn1: - chrom1 = cols[_pairsam_format.COL_C1] - pos1 = int(cols[_pairsam_format.COL_P1]) - strand1 = cols[_pairsam_format.COL_S1] - # algn2: - chrom2 = cols[_pairsam_format.COL_C2] - pos2 = int(cols[_pairsam_format.COL_P2]) - strand2 = cols[_pairsam_format.COL_S2] - # pair type: - pair_type = cols[_pairsam_format.COL_PTYPE] - - stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type) + for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): + stats.add_pairs_from_dataframe(chunk) + if analyse_by_tile_duplication: + dups = chunk.iloc[chunk["duplicate"]] + bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) + dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] + stats._stat["total_dups_by_tile_median"] = dups_by_tile_median stats.calculate_summaries() # save statistics to file ... 
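As a sanity check on the formula in estimate_library_complexity above: it is consistent with inverting the Poisson-sampling relation u = (1 - exp(-N/C)) / (N/C), where N is the number of sequenced read pairs, C the number of distinct molecules in the library, and u the observed unique fraction; the Lambert-W term recovers x = N/C, and C = N/x. A small self-contained check with made-up numbers (not the package's API, just the same expression):

import numpy as np
from scipy import special

def estimate_complexity(nseq, ndup, nopticaldup=0):
    # Same expression as estimate_library_complexity above.
    nseq -= nopticaldup
    ndup -= nopticaldup
    u = (nseq - ndup) / nseq                                  # unique fraction
    x = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u    # x = nseq / C
    return nseq / x

# Simulate sequencing 2M read pairs from a library of 1M distinct molecules;
# the expected number of unique pairs follows the Poisson model, and the
# estimator should recover ~1e6.
C_true, nseq = 1_000_000, 2_000_000
unique = C_true * (1 - np.exp(-nseq / C_true))
print(estimate_complexity(nseq, nseq - unique))  # ~1_000_000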
@@ -383,9 +390,17 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) - self._stat["summary"]["naive_complexity"] = estimate_library_complexity( - self._stat["total_mapped"], self._stat["total_dups"] + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( + self._stat["total_mapped"], self._stat["total_dups"], 0 ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) @classmethod def from_file(cls, file_handle): @@ -675,6 +690,14 @@ def __add__(self, other): self._stat["summary"]["naive_complexity"] + other._stat["summary"]["naive_complexity"] ) + if ( + "complexity_dups_by_tile_median" in self._stat + and "complexity_dups_by_tile_median" in other._stat + ): + sum_stat["summary"]["complexity_dups_by_tile_median"] = ( + self._stat["summary"]["complexity_dups_by_tile_median"] + + other._stat["summary"]["complexity_dups_by_tile_median"] + ) return sum_stat # we need this to be able to sum(list_of_PairCounters) From a857a1bbadd622323326f11f62ad42267d09b073 Mon Sep 17 00:00:00 2001 From: Phlya Date: Mon, 25 Apr 2022 14:12:01 +0200 Subject: [PATCH 06/20] Fix by-chrom stats to ignore dups --- pairtools/pairtools_stats.py | 31 ++++++++------ tests/data/mock.stats.pairs | 21 +++++++++ tests/test_stats.py | 82 ++++++++++++++++++------------------ 3 files changed, 82 insertions(+), 52 deletions(-) create mode 100644 tests/data/mock.stats.pairs diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 060d81dd..5d467c11 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,7 +28,7 @@ " the end of the file.", ) @click.option( - "--by-tile-dups", + "--analyse-by-tile-duplication", is_flag=True, help="If specified, will analyse by-tile duplication statistics to estimate" " library complexity more accurately. Requires parent_readID column to be saved" @@ -93,8 +93,9 @@ def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): if analyse_by_tile_duplication: dups = chunk.iloc[chunk["duplicate"]] bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) - dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] - stats._stat["total_dups_by_tile_median"] = dups_by_tile_median + if analyse_by_tile_duplication: + dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] + stats._stat["total_dups_by_tile_median"] = dups_by_tile_median stats.calculate_summaries() # save statistics to file ... 
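To make the by-tile bookkeeping concrete: extract_tile_info (added in the previous patch) reduces a standard Illumina read name, instrument:run:flowcell:lane:tile:x:y, to its flowcell:lane:tile triple, and analyse_duplicate_stats then counts duplicate pairs per (tile, parent_tile) combination; duplicates sharing a tile with their parent read are the likely optical/clustering artifacts that the corrected complexity estimate discounts. A small illustration with made-up read names:

import pandas as pd

# Hypothetical read names in the usual Illumina layout:
# <instrument>:<run>:<flowcell>:<lane>:<tile>:<x>:<y>
names = pd.Series(
    [
        "A00123:456:HXYZ123:1:1101:1000:2000",
        "A00123:456:HXYZ123:1:1101:1050:2110",
        "A00123:456:HXYZ123:2:2205:3000:4000",
    ]
)
split = names.str.split(":", expand=True)            # as in extract_tile_info
tiles = split[2] + ":" + split[3] + ":" + split[4]
print(tiles.tolist())
# ['HXYZ123:1:1101', 'HXYZ123:1:1101', 'HXYZ123:2:2205']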
@@ -191,7 +192,7 @@ def extract_tile_info(series, regex=False): def analyse_duplicate_stats(dups, tile_dup_regex=False): - """_summary_ + """Count by-tile duplicates Parameters ---------- @@ -294,7 +295,8 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("frac_cis_10kb+", 0), ("frac_cis_20kb+", 0), ("frac_cis_40kb+", 0), - ("naive_complexity", 0), + ("frac_dups", 0), + ("complexity_naive", 0), ] ) @@ -390,6 +392,9 @@ def calculate_summaries(self): self._stat["summary"][f"frac_{cis_count}"] = ( self._stat[cis_count] / self._stat["total_nodups"] ) + self._stat["summary"]["frac_dups"] = ( + self._stat["total_dups"] / self._stat["total_mapped"] + ) self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) @@ -615,18 +620,20 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) + df_nodups = df_mapped.loc[~mask_dups, :] + mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] + df_cis = df_nodups.loc[mask_cis, :].copy() + # Count pairs per chromosome: for (chrom1, chrom2), chrom_count in ( - df_mapped[["chrom1", "chrom2"]].value_counts().items() + df_nodups[["chrom1", "chrom2"]].value_counts().items() ): self._stat["chrom_freq"][(chrom1, chrom2)] = ( self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count ) # Count cis-trans by pairs: - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() + self._stat["cis"] += df_cis.shape[0] self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) @@ -686,9 +693,9 @@ def __add__(self, other): for dirs in sum_stat[k]: sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] sum_stat.calculate_summaries() - sum_stat["summary"]["naive_complexity"] = ( - self._stat["summary"]["naive_complexity"] - + other._stat["summary"]["naive_complexity"] + sum_stat["summary"]["complexity_naive"] = ( + self._stat["summary"]["complexity_naive"] + + other._stat["summary"]["complexity_naive"] ) if ( "complexity_dups_by_tile_median" in self._stat diff --git a/tests/data/mock.stats.pairs b/tests/data/mock.stats.pairs new file mode 100644 index 00000000..4d6bd7fa --- /dev/null +++ b/tests/data/mock.stats.pairs @@ -0,0 +1,21 @@ +## pairs format v1.0.0 +#shape: upper triangle +#genome_assembly: unknown +#samheader: @SQ SN:chr1 LN:100 +#samheader: @SQ SN:chr2 LN:100 +#samheader: @SQ SN:chr3 LN:100 +#samheader: @PG ID:bwa PN:bwa VN:0.7.15-r1140 CL:bwa mem -SP /path/ucsc.hg19.fasta.gz /path/1.fastq.gz /path/2.fastq.gz +#chromosomes: chr2 chr3 chr1 +#chromsize: chr2 100 +#chromsize: chr3 100 +#chromsize: chr1 100 +#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type +readid01 chr1 1 chr2 20 + + UU +readid02 chr1 1 chr1 50 + + UU +readid03 chr1 1 chr1 50 + + DD +readid04 chr1 1 chr1 2 + + UU +readid05 chr1 1 chr1 3 + + UR +readid06 chr2 1 chr3 2 + + UU +readid07 ! 0 chr1 3 - + NU +readid08 ! 0 chr1 3 - + MU +readid09 ! 0 ! 
0 - - WW diff --git a/tests/test_stats.py b/tests/test_stats.py index b6691369..2f906569 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -3,28 +3,27 @@ import sys import subprocess from nose.tools import assert_raises +import numpy as np testdir = os.path.dirname(os.path.realpath(__file__)) + def test_mock_pairsam(): - mock_pairsam_path = os.path.join(testdir, 'data', 'mock.pairsam') + mock_pairsam_path = os.path.join(testdir, "data", "mock.stats.pairs") try: result = subprocess.check_output( - ['python', - '-m', - 'pairtools', - 'stats', - mock_pairsam_path], - ).decode('ascii') + ["python", "-m", "pairtools", "stats", mock_pairsam_path], + ).decode("ascii") except subprocess.CalledProcessError as e: print(e.output) print(sys.exc_info()) raise e - - stats = dict(l.strip().split('\t') - for l in result.split('\n') - if not l.startswith('#') and l.strip()) + stats = dict( + l.strip().split("\t") + for l in result.split("\n") + if not l.startswith("#") and l.strip() + ) for k in stats: try: @@ -33,34 +32,37 @@ def test_mock_pairsam(): stats[k] = float(stats[k]) print(stats) - assert stats['total'] == 8 - assert stats['total_single_sided_mapped'] == 2 - assert stats['total_mapped'] == 5 - assert stats['cis'] == 3 - assert stats['trans'] == 2 - assert stats['pair_types/UU'] == 4 - assert stats['pair_types/NU'] == 1 - assert stats['pair_types/WW'] == 1 - assert stats['pair_types/UR'] == 1 - assert stats['pair_types/MU'] == 1 - assert stats['chrom_freq/chr1/chr2'] == 1 - assert stats['chrom_freq/chr1/chr1'] == 3 - assert stats['chrom_freq/chr2/chr3'] == 1 - assert all(stats[k]==0 - for k in stats - if k.startswith('dist_freq') - and k not in ['dist_freq/1-2/++', - 'dist_freq/2-3/++', - 'dist_freq/32-56/++']) + assert stats["total"] == 9 + assert stats["total_single_sided_mapped"] == 2 + assert stats["total_mapped"] == 6 + assert stats["total_dups"] == 1 + assert stats["cis"] == 3 + assert stats["trans"] == 2 + assert stats["pair_types/UU"] == 4 + assert stats["pair_types/NU"] == 1 + assert stats["pair_types/WW"] == 1 + assert stats["pair_types/UR"] == 1 + assert stats["pair_types/MU"] == 1 + assert stats["pair_types/DD"] == 1 + assert stats["chrom_freq/chr1/chr2"] == 1 + assert stats["chrom_freq/chr1/chr1"] == 3 + assert stats["chrom_freq/chr2/chr3"] == 1 + assert all( + stats[k] == 0 + for k in stats + if k.startswith("dist_freq") + and k not in ["dist_freq/1-2/++", "dist_freq/2-3/++", "dist_freq/32-56/++"] + ) - assert stats['dist_freq/1-2/++'] == 1 - assert stats['dist_freq/2-3/++'] == 1 - assert stats['dist_freq/32-56/++'] == 1 - assert stats['summary/frac_cis'] == 0.6 - assert stats['summary/frac_cis_1kb+'] == 0 - assert stats['summary/frac_cis_2kb+'] == 0 - assert stats['summary/frac_cis_4kb+'] == 0 - assert stats['summary/frac_cis_10kb+'] == 0 - assert stats['summary/frac_cis_20kb+'] == 0 - assert stats['summary/frac_cis_40kb+'] == 0 + assert stats["dist_freq/1-2/++"] == 1 + assert stats["dist_freq/2-3/++"] == 1 + assert stats["dist_freq/32-56/++"] == 1 + assert stats["summary/frac_cis"] == 0.6 + assert stats["summary/frac_cis_1kb+"] == 0 + assert stats["summary/frac_cis_2kb+"] == 0 + assert stats["summary/frac_cis_4kb+"] == 0 + assert stats["summary/frac_cis_10kb+"] == 0 + assert stats["summary/frac_cis_20kb+"] == 0 + assert stats["summary/frac_cis_40kb+"] == 0 + assert np.isclose(stats["summary/frac_dups"], 1 / 6) From 30014383499dc4dc0d3e0e4516a98b7ec155fee0 Mon Sep 17 00:00:00 2001 From: Phlya Date: Mon, 25 Apr 2022 14:13:50 +0200 Subject: [PATCH 07/20] Rename mock4stats 
--- tests/data/{mock.stats.pairs => mock.4stats.pairs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/data/{mock.stats.pairs => mock.4stats.pairs} (100%) diff --git a/tests/data/mock.stats.pairs b/tests/data/mock.4stats.pairs similarity index 100% rename from tests/data/mock.stats.pairs rename to tests/data/mock.4stats.pairs From 1a614e7dac797f2de75ae5615270975718d0fff0 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 26 Apr 2022 15:36:50 +0200 Subject: [PATCH 08/20] Make dedup stats! --- pairtools/pairtools_dedup.py | 57 ++++++++++++++---------- pairtools/pairtools_stats.py | 86 +++++++++++++++++++++++++++--------- tests/test_stats.py | 2 +- 3 files changed, 101 insertions(+), 44 deletions(-) diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py index c2f472d0..cca5ce89 100644 --- a/pairtools/pairtools_dedup.py +++ b/pairtools/pairtools_dedup.py @@ -70,7 +70,15 @@ " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." " By default, statistics are not printed.", ) - +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed.", +) ### Set the dedup method: @click.option( "--max-mismatch", @@ -233,6 +241,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -270,6 +279,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -303,6 +313,7 @@ def dedup_py( output_dups, output_unmapped, output_stats, + output_bytile_dups_stats, chunksize, carryover, max_mismatch, @@ -360,6 +371,17 @@ def dedup_py( else None ) + out_bytile_dups_stats_stream = ( + _fileio.auto_open( + output_bytile_dups_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_bytile_dups_stats + else None + ) + # generate empty PairCounter if stats output is requested: out_stat = PairCounter() if output_stats else None @@ -401,8 +423,8 @@ def dedup_py( outstream.writelines((l + "\n" for l in header)) if send_header_to_dup and outstream_dups and (outstream_dups != outstream): dups_header = header - if keep_parent_id and len(dups_header)>0: - dups_header= _headerops.append_columns(dups_header, ["parent_readID"]) + if keep_parent_id and len(dups_header) > 0: + dups_header = _headerops.append_columns(dups_header, ["parent_readID"]) outstream_dups.writelines((l + "\n" for l in dups_header)) if ( outstream_unmapped @@ -475,6 +497,9 @@ def dedup_py( if out_stat: out_stat.save(out_stats_stream) + if output_bytile_dups_stats: + out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + if instream != sys.stdin: instream.close() @@ -495,7 +520,7 @@ def dedup_py( out_stats_stream.close() -### Cython deduplication #### +### Chunked dedup ### def streaming_dedup( in_stream, colnames, @@ -531,15 +556,14 @@ def streaming_dedup( n_proc=n_proc, ) - t0 = time.time() - N = 0 - for df_chunk in deduped_chunks: - N += df_chunk.shape[0] - # Write the stats if requested: if out_stat is not None: - out_stat.add_pairs_from_dataframe(df_chunk, unmapped_chrom=unmapped_chrom) + out_stat.add_pairs_from_dataframe( + df_chunk, + unmapped_chrom=unmapped_chrom, + analyse_bytile_dups=keep_parent_id, + ) # Define masks of unmapped and duplicated reads: mask_mapped = 
np.logical_and( @@ -572,11 +596,6 @@ def streaming_dedup( outstream, index=False, header=False, sep="\t" ) - t1 = time.time() - t = t1 - t0 - print(f"total time: {t}") - print(f"time per mln pairs: {t/N*1e6}") - def _dedup_stream( in_stream, @@ -838,9 +857,6 @@ def streaming_dedup_cython( strandDict = {} curMaxLen = max(MAX_LEN, dd.getLen()) - t0 = time.time() - N = 0 - instream = iter(instream) read_idx = 0 # read index to mark the parent readID while True: @@ -900,7 +916,6 @@ def streaming_dedup_cython( else: s1.append(fetchadd(cols[s1ind], strandDict)) s2.append(fetchadd(cols[s2ind], strandDict)) - N += 1 if (not stripline) or (len(c1) == curMaxLen): if keep_parent_id: res, parents = dd.push( @@ -997,10 +1012,6 @@ def streaming_dedup_cython( # all lines have been processed at this point. # streaming_dedup is over. - t1 = time.time() - t = t1 - t0 - print(f"total time: {t}") - print(f"time per mln pairs: {t/N*1e6}") def fetchadd(key, mydict): diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 5d467c11..1c8682d8 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,14 +28,25 @@ " the end of the file.", ) @click.option( - "--analyse-by-tile-duplication", - is_flag=True, + "--analyse-bytile-dups", + type=str, + default="", help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately. Requires parent_readID column to be saved" - " by dedup (will ignore this option otherwise)", + " library complexity more accurately." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately and will save details to this path." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @common_io_options -def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): +def stats( + input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs +): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -45,10 +56,19 @@ def stats(input_path, output, merge, analyse_by_tile_duplication, **kwargs): The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. """ - stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs) + stats_py( + input_path, + output, + merge, + analyse_bytile_dups, + output_bytile_dups_stats, + **kwargs, + ) -def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): +def stats_py( + input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs +): if merge: do_merge(output, input_path, **kwargs) return @@ -77,25 +97,23 @@ def stats_py(input_path, output, merge, analyse_by_tile_duplication, **kwargs): header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) - if analyse_by_tile_duplication and "parent_readID" not in cols: + if ( + analyse_bytile_dups or output_bytile_dups_stats + ) and "parent_readID" not in cols: warnings.warn( - "No 'parent_readID' column in the file, not generating duplicate" " stats." + "No 'parent_readID' column in the file, not generating duplicate stats." 
) - analyse_by_tile_duplication = False - elif analyse_by_tile_duplication: - bytile_dups = pd.DataFrame([]) + analyse_bytile_dups = False + output_bytile_dups_stats = False # new stats class stuff would come here ... stats = PairCounter() # Collecting statistics for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): stats.add_pairs_from_dataframe(chunk) - if analyse_by_tile_duplication: - dups = chunk.iloc[chunk["duplicate"]] - bytile_dups.add(analyse_duplicate_stats(dups), fill_value=0).astype(int) - if analyse_by_tile_duplication: - dups_by_tile_median = bytile_dups["dup_count"].median() * bytile_dups.shape[0] - stats._stat["total_dups_by_tile_median"] = dups_by_tile_median + + if output_bytile_dups_stats: + stats.save_bytile_dups() stats.calculate_summaries() # save statistics to file ... @@ -299,6 +317,11 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("complexity_naive", 0), ] ) + self.bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) + ) def __getitem__(self, key): if isinstance(key, str): @@ -398,6 +421,10 @@ def calculate_summaries(self): self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) + if self.bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] + ) if "dups_by_tile_median" in self._stat: self._stat["summary"][ "complexity_dups_by_tile_median" @@ -575,7 +602,9 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): + def add_pairs_from_dataframe( + self, df, unmapped_chrom="!", analyse_bytile_dups=False + ): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -616,7 +645,8 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups_count = mask_dups.sum() + dups = df_mapped[mask_dups] + dups_count = dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) @@ -650,6 +680,12 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) + ### Add by-tile dups + if analyse_bytile_dups and dups.shape[0] > 0: + self.bytile_dups = self.bytile_dups.add( + analyse_duplicate_stats(dups), fill_value=0 + ).astype(int) + def __add__(self, other): # both PairCounter are implied to have a list of common fields: # @@ -705,6 +741,7 @@ def __add__(self, other): self._stat["summary"]["complexity_dups_by_tile_median"] + other._stat["summary"]["complexity_dups_by_tile_median"] ) + # self.bytile_dups.add(other.bytile_dups, fill_value=0).astype(int) return sum_stat # we need this to be able to sum(list_of_PairCounters) @@ -789,6 +826,15 @@ def save(self, outstream): for k, v in self.flatten().items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + def save_bytile_dups(self, outstream): + """save bytile duplication counts to a tab-delimited text file. 
+ + Parameters + ---------- + outstream: file handle + """ + self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + if __name__ == "__main__": stats() diff --git a/tests/test_stats.py b/tests/test_stats.py index 2f906569..9a516bb7 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -9,7 +9,7 @@ def test_mock_pairsam(): - mock_pairsam_path = os.path.join(testdir, "data", "mock.stats.pairs") + mock_pairsam_path = os.path.join(testdir, "data", "mock.4stats.pairs") try: result = subprocess.check_output( ["python", "-m", "pairtools", "stats", mock_pairsam_path], From 0a0207982f5027733e0d852e040192b88b4f37b5 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 10:35:51 +0200 Subject: [PATCH 09/20] Address comments --- pairtools/pairtools_stats.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 1c8682d8..25349e3b 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -28,10 +28,9 @@ " the end of the file.", ) @click.option( - "--analyse-bytile-dups", - type=str, - default="", - help="If specified, will analyse by-tile duplication statistics to estimate" + "--bytile-dups/--no-bytile-dups", + default=False, + help="If enabled, will analyse by-tile duplication statistics to estimate" " library complexity more accurately." " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @@ -176,6 +175,9 @@ def estimate_library_complexity(nseq, ndup, nopticaldup=0): Estimated complexity """ nseq = nseq - nopticaldup + if nseq == 0: + warnings.warn("Empty of fully duplicated library, can't estimate complexity") + return 0 ndup = ndup - nopticaldup u = (nseq - ndup) / nseq seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u @@ -413,10 +415,14 @@ def calculate_summaries(self): "cis_40kb+", ): self._stat["summary"][f"frac_{cis_count}"] = ( - self._stat[cis_count] / self._stat["total_nodups"] + (self._stat[cis_count] / self._stat["total_nodups"]) + if self._stat["total_nodups"] > 0 + else 0 ) self._stat["summary"]["frac_dups"] = ( - self._stat["total_dups"] / self._stat["total_mapped"] + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 ) self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 From f0bb8ca26945ea531f5acb265cf7307ad6f31f58 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 10:38:37 +0200 Subject: [PATCH 10/20] FIx argument name --- pairtools/pairtools_stats.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py index 25349e3b..f0dd71cd 100755 --- a/pairtools/pairtools_stats.py +++ b/pairtools/pairtools_stats.py @@ -43,9 +43,7 @@ " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", ) @common_io_options -def stats( - input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs -): +def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -56,17 +54,12 @@ def stats( The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. 
""" stats_py( - input_path, - output, - merge, - analyse_bytile_dups, - output_bytile_dups_stats, - **kwargs, + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, ) def stats_py( - input_path, output, merge, analyse_bytile_dups, output_bytile_dups_stats, **kwargs + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs ): if merge: do_merge(output, input_path, **kwargs) @@ -96,13 +89,11 @@ def stats_py( header, body_stream = _headerops.get_header(instream) cols = _headerops.extract_column_names(header) - if ( - analyse_bytile_dups or output_bytile_dups_stats - ) and "parent_readID" not in cols: + if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: warnings.warn( "No 'parent_readID' column in the file, not generating duplicate stats." ) - analyse_bytile_dups = False + bytile_dups = False output_bytile_dups_stats = False # new stats class stuff would come here ... stats = PairCounter() @@ -608,9 +599,7 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe( - self, df, unmapped_chrom="!", analyse_bytile_dups=False - ): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -687,7 +676,7 @@ def add_pairs_from_dataframe( self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) ### Add by-tile dups - if analyse_bytile_dups and dups.shape[0] > 0: + if bytile_dups and dups.shape[0] > 0: self.bytile_dups = self.bytile_dups.add( analyse_duplicate_stats(dups), fill_value=0 ).astype(int) From 9a378fc27a1e8afffbc8fcd940728c3edce07588 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 14:13:34 +0200 Subject: [PATCH 11/20] Split dedup and stats into lib and cli for 1.0 --- pairtools/__init__.py | 119 +--- pairtools/_dedup.pyx | 293 ---------- pairtools/cli/__init__.py | 206 +++++++ pairtools/cli/dedup.py | 511 +++++++++++++++++ pairtools/cli/stats.py | 117 ++++ pairtools/pairtools_dedup.py | 1025 ---------------------------------- pairtools/pairtools_stats.py | 835 --------------------------- 7 files changed, 837 insertions(+), 2269 deletions(-) delete mode 100644 pairtools/_dedup.pyx create mode 100644 pairtools/cli/__init__.py create mode 100644 pairtools/cli/dedup.py create mode 100755 pairtools/cli/stats.py delete mode 100644 pairtools/pairtools_dedup.py delete mode 100755 pairtools/pairtools_stats.py diff --git a/pairtools/__init__.py b/pairtools/__init__.py index ceda0d19..e4aa6788 100644 --- a/pairtools/__init__.py +++ b/pairtools/__init__.py @@ -1,128 +1,15 @@ -# -*- coding: utf-8 -*- """ pairtools ~~~~~~~~~ CLI tools to process mapped Hi-C data -:copyright: (c) 2017-2021 Open2C +:copyright: (c) 2017-2022 Open2C :author: Open2C :license: MIT """ -__version__ = '0.3.1-dev.1' +__version__ = "1.0.0-dev1" - -import click -import functools -import sys - -CONTEXT_SETTINGS = { - 'help_option_names': ['-h', '--help'], -} - - -@click.version_option(version=__version__) -@click.group(context_settings=CONTEXT_SETTINGS) -@click.option( - '--post-mortem', - help="Post mortem debugging", - is_flag=True, - default=False -) - -@click.option( - '--output-profile', - help="Profile performance with Python cProfile and dump the statistics " - "into a binary file", - type=str, - default='' -) -def cli(post_mortem, output_profile): - '''Flexible tools for Hi-C data processing. 
- - All pairtools have a few common options, which should be typed _before_ - the command name. - - ''' - if post_mortem: - import traceback - try: - import ipdb as pdb - except ImportError: - import pdb - def _excepthook(exc_type, value, tb): - traceback.print_exception(exc_type, value, tb) - print() - pdb.pm() - sys.excepthook = _excepthook - - if output_profile: - import cProfile - import atexit - - pr = cProfile.Profile() - pr.enable() - - def _atexit_profile_hook(): - pr.disable() - pr.dump_stats(output_profile) - - atexit.register(_atexit_profile_hook) - - -def common_io_options(func): - @click.option( - '--nproc-in', - type=int, - default=3, - show_default=True, - help='Number of processes used by the auto-guessed input decompressing command.' - ) - @click.option( - '--nproc-out', - type=int, - default=8, - show_default=True, - help='Number of processes used by the auto-guessed output compressing command.' - ) - @click.option( - '--cmd-in', - type=str, - default=None, - help='A command to decompress the input file. ' - 'If provided, fully overrides the auto-guessed command. ' - 'Does not work with stdin and pairtools parse. ' - 'Must read input from stdin and print output into stdout. ' - 'EXAMPLE: pbgzip -dc -n 3' - ) - @click.option( - '--cmd-out', - type=str, - default=None, - help='A command to compress the output file. ' - 'If provided, fully overrides the auto-guessed command. ' - 'Does not work with stdout. ' - 'Must read input from stdin and print output into stdout. ' - 'EXAMPLE: pbgzip -c -n 8' - ) - - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - return wrapper - -from .pairtools_dedup import dedup -from .pairtools_sort import sort -from .pairtools_flip import flip -from .pairtools_merge import merge -from .pairtools_markasdup import markasdup -from .pairtools_select import select -from .pairtools_split import split -from .pairtools_restrict import restrict -from .pairtools_phase import phase -from .pairtools_parse import parse -from .pairtools_stats import stats -from .pairtools_sample import sample -from .pairtools_filterbycov import filterbycov +# from . import lib diff --git a/pairtools/_dedup.pyx b/pairtools/_dedup.pyx deleted file mode 100644 index 2c08db9a..00000000 --- a/pairtools/_dedup.pyx +++ /dev/null @@ -1,293 +0,0 @@ -""" -Legacy code: -``mark_duplicates`` is an offline method that finds duplicates in a given -input dataset. - -For other applications on much larger datasets you may consider an online -method ``OnlineDuplicateDetector`` which is implemented as a class. -Note that for both methods data types are fixed: - * chromosomes are int32 - * position is int32 - * strand is int32, which is basically the same as C type "char". -""" -import numpy as np -import cython - -cimport numpy as np -cimport cython - - -### Legacy offfline deduplicator: -def mark_duplicates( - cython.int [:] c1, - cython.int [:] c2, - cython.int [:] p1, - cython.int [:] p2, - cython.int [:] s1, - cython.int [:] s2, - #uncomment for testing probably since it will do boundary check - #np.ndarray[np.int16_t, ndim=1]c2, - #np.ndarray[np.int32_t, ndim=1] p1, - #np.ndarray[np.int32_t, ndim=1] p2, - #np.ndarray[np.int8_t, ndim=1] s1, - #np.ndarray[np.int8_t, ndim=1] s2, - int max_mismatch=3, - method = "sum"): - """ - Mark duplicates, allowing for some mismatch on the both sides of the molecule. - You can use it to filter single-cell data as well by setting max_mismatch to - 500bp or even larger. It works as fast as we could make it. 
- This methods scans through a list of reads. It then flags duplicates, - which are defined as molecules, with both ends located within `max_mismatch` - bp from each other. - - There are two ways define duplicates: - "max": two reads are duplicates if the mismatch of the genomic locations - of both ends is less-or-equal "max_mismatch" - "sum": two reads are duplicates if the sum of the mismatches of the either - ends of the molecule is less-or-equal "max_mismatch" - Other methods could be added below by editing the code - - Parameters - ---------- - c1, c2 : int32 array - chromosome IDs - - p1, p2 : int32 arrays - positions - - s1, s2 : int32 (or bool) arrays - strands - - max_mismatch : int - method : "sum" or "max" - use the sum of mismatches, or the max of the two - - Returns - ------- - mask : int8 array - A binary mask, where 1 denotes that a read is a duplicate. - Notes - ----- - Arrays MUST be ordered by (c1, p1) - - """ - cdef int N = len(c1) - cdef np.ndarray[np.int8_t, ndim=1] mask = np.zeros(N, dtype=np.int8) - cdef int low = 0 - cdef int high = 1 - cdef int extraCondition - cdef int methodid - - if method == "max": - methodid = 0 - elif method == "sum": - methodid = 1 - else: - raise ValueError('method should be "sum" or "max"') - - while True: - assert False - if low == N: - break - - if high == N: - low += 1 - high = low + 1 - continue - - if mask[low] == 1: - low += 1 - high = low+1 - continue - - # if high already removed, just continue - if mask[high] == 1: - high += 1 - continue - - # if we jumped too far, continue - if (c1[high] != c1[low]) or (p1[high] - p1[low] > max_mismatch) or (p1[high] < p1[low]) or (c2[high] != c2[low]): - low += 1 - high = low + 1 # restart high - continue - - if methodid == 0: - extraCondition = (max(abs(p1[low] - p1[high]), - abs(p2[low] - p2[high])) <= max_mismatch) - elif methodid == 1: - extraCondition = (abs(p1[low] - p1[high]) + - abs(p2[low] - p2[high]) <= max_mismatch) - else: - raise ValueError( - "Unknown method id, this should not happen. 
" - "Check code of this function.") - - if ((c2[low] == c2[high]) and (s1[low] == s1[high]) and - (s2[low] == s2[high]) and extraCondition): - mask[high] = 1 - high += 1 - continue - high += 1 - - return mask - - -### Online deduplicator used in pairtools.dedup Cython: -cdef class OnlineDuplicateDetector(object): - cdef cython.int [:] c1 - cdef cython.int [:] c2 - cdef cython.int [:] p1 - cdef cython.int [:] p2 - cdef cython.int [:] s1 - cdef cython.int [:] s2 - cdef cython.char [:] rm - cdef cython.int [:] parent_idxs - cdef int methodid - cdef int low - cdef int high - cdef int N - cdef int max_mismatch - cdef int returnData - cdef int keep_parent_id - - def __init__(self, method, max_mismatch, returnData=False, keep_parent_id=False): - if returnData == False: - self.returnData = 0 - else: - self.returnData = 1 - if keep_parent_id == False: - self.keep_parent_id = 0 - else: - self.keep_parent_id = 1 - self.parent_idxs = np.zeros(0, np.int32) - - self.N = 0 - self.c1 = np.zeros(0, np.int32) - self.c2 = np.zeros(0, np.int32) - self.p1 = np.zeros(0, np.int32) - self.p2 = np.zeros(0, np.int32) - self.s1 = np.zeros(0, np.int32) - self.s2 = np.zeros(0, np.int32) - - self.rm = np.zeros(0, np.int8) - if method == "max": - self.methodid = 0 - elif method == "sum": - self.methodid = 1 - else: - raise ValueError('method should be "sum" or "max"') - self.max_mismatch = int(max_mismatch) - self.low = 0 - self.high = 1 - - def _shrink(self): - if self.returnData == 1: - firstret = self.rm[:self.low] - retainMask = (np.asarray(firstret) == False) - del firstret - ret = [] - for ar in [self.c1, self.c2, self.p1, self.p2, self.s1, self.s2]: - ret.append(np.asarray(ar)[:self.low][retainMask]) - self.c1 = self.c1[self.low:] - self.c2 = self.c2[self.low:] - self.p1 = self.p1[self.low:] - self.p2 = self.p2[self.low:] - self.s1 = self.s1[self.low:] - self.s2 = self.s2[self.low:] - pastrm = self.rm[:self.low] - self.rm = self.rm[self.low:] - self.high = self.high-self.low - self.N = self.N - self.low - if self.returnData == 1: - self.low = 0 - return ret - if self.keep_parent_id == 1: # Return parent readIDs alongside with duplicates mask: - pastidx = self.parent_idxs[:self.low] - self.low = 0 - return pastrm, pastidx - return pastrm - - def _run(self, finish=False): - cdef int finishing = 0 - cdef int extraCondition - - if finish: - finishing = 1 - - while True: - if self.low == self.N: - break - - if self.high == self.N: - if finishing == 1: - self.low += 1 - self.high = self.low + 1 - continue - else: - break - - if self.rm[self.low] == 1: - self.low += 1 - self.high = self.low+1 - continue - - # if high already removed, just continue - if self.rm[self.high] == 1: - self.high += 1 - continue - - # if we jumped too far, continue - if ((self.c1[self.high] != self.c1[self.low]) or - (self.p1[self.high] - self.p1[self.low] > self.max_mismatch) or - (self.p1[self.high] - self.p1[self.low] < 0 )): - self.low += 1 - self.high = self.low + 1 # restart high - continue - - if self.methodid == 0: - extraCondition = max( - abs(self.p1[self.low] - self.p1[self.high]), - abs(self.p2[self.low] - self.p2[self.high])) <= self.max_mismatch - elif self.methodid == 1: - # sum of distances <= max_mismatch - extraCondition = ( - abs(self.p1[self.low] - self.p1[self.high]) + - abs(self.p2[self.low] - self.p2[self.high]) <= self.max_mismatch - ) - else: - raise ValueError( - "Unknown method id, this should not happen. 
" - "Check code of this function.") - - if ((self.c2[self.low] == self.c2[self.high]) and - (self.s1[self.low] == self.s1[self.high]) and - (self.s2[self.low] == self.s2[self.high]) and - extraCondition): - self.rm[self.high] = 1 - if self.keep_parent_id == 1: - self.parent_idxs[self.high] = self.low - self.high += 1 - continue - self.high += 1 - - return self._shrink() - - def push(self, c1, c2, p1, p2, s1, s2): - self.c1 = np.concatenate([self.c1, c1]) - self.c2 = np.concatenate([self.c2, c2]) - self.p1 = np.concatenate([self.p1, p1]) - self.p2 = np.concatenate([self.p2, p2]) - self.s1 = np.concatenate([self.s1, s1]) - self.s2 = np.concatenate([self.s2, s2]) - self.rm = np.concatenate([self.rm, np.zeros(len(c1), dtype=np.int8)]) - if self.keep_parent_id == 1: - self.parent_idxs = np.concatenate([self.parent_idxs, np.zeros(len(c1), dtype=np.int32)]) - self.N = self.N + len(c1) - return self._run(finish=False) - - def finish(self): - return self._run(finish=True) - - def getLen(self): - return int(self.N) \ No newline at end of file diff --git a/pairtools/cli/__init__.py b/pairtools/cli/__init__.py new file mode 100644 index 00000000..6fa156da --- /dev/null +++ b/pairtools/cli/__init__.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- + +import click +import functools +import sys +from .. import __version__ +import logging +from .._logging import get_logger + + +CONTEXT_SETTINGS = { + "help_option_names": ["-h", "--help"], +} + + +@click.version_option(version=__version__) +@click.group(context_settings=CONTEXT_SETTINGS) +@click.option( + "--post-mortem", help="Post mortem debugging", is_flag=True, default=False +) +@click.option( + "--output-profile", + help="Profile performance with Python cProfile and dump the statistics " + "into a binary file", + type=str, + default="", +) +@click.option("-v", "--verbose", help="Verbose logging.", count=True) +@click.option( + "-d", + "--debug", + help="On error, drop into the post-mortem debugger shell.", + is_flag=True, + default=False, +) +def cli(post_mortem, output_profile, verbose, debug): + """Flexible tools for Hi-C data processing. + + All pairtools have a few common options, which should be typed _before_ + the command name. 
+ + """ + if post_mortem: + import traceback + + try: + import ipdb as pdb + except ImportError: + import pdb + + def _excepthook(exc_type, value, tb): + traceback.print_exception(exc_type, value, tb) + print() + pdb.pm() + + sys.excepthook = _excepthook + + if output_profile: + import cProfile + import atexit + + pr = cProfile.Profile() + pr.enable() + + def _atexit_profile_hook(): + pr.disable() + pr.dump_stats(output_profile) + + atexit.register(_atexit_profile_hook) + + # Initialize logging to stderr + logging.basicConfig(stream=sys.stderr) + logging.captureWarnings(True) + root_logger = get_logger() + + # Set verbosity level + if verbose > 0: + root_logger.setLevel(logging.DEBUG) + if verbose > 1: # pragma: no cover + try: + import psutil + import atexit + + @atexit.register + def process_dump_at_exit(): + process_attrs = [ + "cmdline", + # 'connections', + "cpu_affinity", + "cpu_num", + "cpu_percent", + "cpu_times", + "create_time", + "cwd", + # 'environ', + "exe", + # 'gids', + "io_counters", + "ionice", + "memory_full_info", + # 'memory_info', + # 'memory_maps', + "memory_percent", + "name", + "nice", + "num_ctx_switches", + "num_fds", + "num_threads", + "open_files", + "pid", + "ppid", + "status", + "terminal", + "threads", + # 'uids', + "username", + ] + p = psutil.Process() + info_ = p.as_dict(process_attrs, ad_value="") + for key in process_attrs: + root_logger.debug("PSINFO:'{}': {}".format(key, info_[key])) + + except ImportError: + root_logger.warning("Install psutil to see process information.") + + else: + root_logger.setLevel(logging.INFO) + + # Set hook for postmortem debugging + if debug: # pragma: no cover + import traceback + + try: + import ipdb as pdb + except ImportError: + import pdb + + def _excepthook(exc_type, value, tb): + traceback.print_exception(exc_type, value, tb) + print() + pdb.pm() + + sys.excepthook = _excepthook + + +def common_io_options(func): + @click.option( + "--nproc-in", + type=int, + default=3, + show_default=True, + help="Number of processes used by the auto-guessed input decompressing command.", + ) + @click.option( + "--nproc-out", + type=int, + default=8, + show_default=True, + help="Number of processes used by the auto-guessed output compressing command.", + ) + @click.option( + "--cmd-in", + type=str, + default=None, + help="A command to decompress the input file. " + "If provided, fully overrides the auto-guessed command. " + "Does not work with stdin and pairtools parse. " + "Must read input from stdin and print output into stdout. " + "EXAMPLE: pbgzip -dc -n 3", + ) + @click.option( + "--cmd-out", + type=str, + default=None, + help="A command to compress the output file. " + "If provided, fully overrides the auto-guessed command. " + "Does not work with stdout. " + "Must read input from stdin and print output into stdout. " + "EXAMPLE: pbgzip -c -n 8", + ) + @functools.wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +from . import ( + dedup, + sort, + flip, + merge, + markasdup, + select, + split, + restrict, + phase, + parse, + parse2, + stats, + sample, + filterbycov, + header, + scaling, +) + diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py new file mode 100644 index 00000000..dd7ea14d --- /dev/null +++ b/pairtools/cli/dedup.py @@ -0,0 +1,511 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import ast +import pathlib + +# from distutils.log import warn +# import warnings + +from ..lib import fileio, pairsam_format, headerops +from . 
import cli, common_io_options + +import click + +from ..lib.dedup import streaming_dedup, streaming_dedup_cython +from ..lib.stats import PairCounter + +UTIL_NAME = "pairtools_dedup" + + +@cli.command() +@click.argument("pairs_path", type=str, required=False) + +### Output files: +@click.option( + "-o", + "--output", + type=str, + default="", + help="output file for pairs after duplicate removal." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, the output is printed into stdout.", +) +@click.option( + "--output-dups", + type=str, + default="", + help="output file for duplicated pairs. " + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " If the path is the same as in --output or -, output duplicates together " + " with deduped pairs. By default, duplicates are dropped.", +) +@click.option( + "--output-unmapped", + type=str, + default="", + help="output file for unmapped pairs. " + "If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed. " + "If the path is the same as in --output or -, output unmapped pairs together " + "with deduped pairs. If the path is the same as --output-dups, output " + "unmapped reads together with dups. By default, unmapped pairs are dropped.", +) +@click.option( + "--output-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, statistics are not printed.", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed.", +) +### Set the dedup method: +@click.option( + "--max-mismatch", + type=int, + default=3, + show_default=True, + help="Pairs with both sides mapped within this distance (bp) from each " + "other are considered duplicates. ", +) +@click.option( + "--method", + type=click.Choice(["max", "sum"]), + default="max", + help="define the mismatch as either the max or the sum of the mismatches of" + "the genomic locations of the both sides of the two compared molecules", + show_default=True, +) +@click.option( + "--backend", + type=click.Choice(["scipy", "sklearn", "cython"]), + default="scipy", + help="What backend to use: scipy and sklearn are based on KD-trees," + " cython is online indexed list-based algorithm." + " With cython backend, duplication is not transitive with non-zero max mismatch " + " (e.g. pairs A and B are duplicates, and B and C are duplicates, then A and C are " + " not necessary duplicates of each other), while with scipy and sklearn it's " + " transitive (i.e. A and C are necessarily duplicates)." + " Cython is the original version used in pairtools since its beginning." + " It is available for backwards compatibility and to allow specification of the" + " column order." + " Now the default scipy backend is generally the fastest, and with chunksize below" + " 1 mln has the lowest memory requirements." + # " 'cython' is deprecated and provided for backwards compatibility", +) + +### Scipy and sklearn-specific options: +@click.option( + "--chunksize", + type=int, + default=100_000, + show_default=True, + help="Number of pairs in each chunk. Reduce for lower memory footprint." 
+ " Below 10,000 performance starts suffering significantly and the algorithm might" + " miss a few duplicates with non-zero --max-mismatch." + " Only works with '--backend scipy or sklearn'", +) +@click.option( + "--carryover", + type=int, + default=100, + show_default=True, + help="Number of deduped pairs to carry over from previous chunk to the new chunk" + " to avoid breaking duplicate clusters." + " Only works with '--backend scipy or sklearn'", +) +@click.option( + "-p", + "--n-proc", + type=int, + default=1, + help="Number of cores to use. Only applies with sklearn backend." + "Still needs testing whether it is ever useful.", +) + +### Output options: +@click.option( + "--mark-dups", + is_flag=True, + help='If specified, duplicate pairs are marked as DD in "pair_type" and ' + "as a duplicate in the sam entries.", +) +@click.option( + "--keep-parent-id", + is_flag=True, + help="If specified, duplicate pairs are marked with the readID of the retained" + " deduped read in the 'parent_readID' field.", +) +@click.option( + "--extra-col-pair", + nargs=2, + # type=click.Tuple([str, str]), + multiple=True, + help="Extra columns that also must match for two pairs to be marked as " + "duplicates. Can be either provided as 0-based column indices or as column " + 'names (requires the "#columns" header field). The option can be provided ' + "multiple times if multiple column pairs must match. " + 'Example: --extra-col-pair "phase1" "phase2"', +) + +### Input options: +@click.option( + "--sep", + type=str, + default=pairsam_format.PAIRSAM_SEP_ESCAPE, + help=r"Separator (\t, \v, etc. characters are " "supported, pass them in quotes) ", +) +@click.option( + "--comment-char", type=str, default="#", help="The first character of comment lines" +) +@click.option( + "--send-header-to", + type=click.Choice(["dups", "dedup", "both", "none"]), + default="both", + help="Which of the outputs should receive header and comment lines", +) +@click.option( + "--c1", + type=int, + default=pairsam_format.COL_C1, + help=f"Chrom 1 column; default {pairsam_format.COL_C1}" + " Only works with '--backend cython'", +) +@click.option( + "--c2", + type=int, + default=pairsam_format.COL_C2, + help=f"Chrom 2 column; default {pairsam_format.COL_C2}" + " Only works with '--backend cython'", +) +@click.option( + "--p1", + type=int, + default=pairsam_format.COL_P1, + help=f"Position 1 column; default {pairsam_format.COL_P1}" + " Only works with '--backend cython'", +) +@click.option( + "--p2", + type=int, + default=pairsam_format.COL_P2, + help=f"Position 2 column; default {pairsam_format.COL_P2}" + " Only works with '--backend cython'", +) +@click.option( + "--s1", + type=int, + default=pairsam_format.COL_S1, + help=f"Strand 1 column; default {pairsam_format.COL_S1}" + " Only works with '--backend cython'", +) +@click.option( + "--s2", + type=int, + default=pairsam_format.COL_S2, + help=f"Strand 2 column; default {pairsam_format.COL_S2}" + " Only works with '--backend cython'", +) +@click.option( + "--unmapped-chrom", + type=str, + default=pairsam_format.UNMAPPED_CHROM, + help="Placeholder for a chromosome on an unmapped side; default {}".format( + pairsam_format.UNMAPPED_CHROM + ), +) +@common_io_options +def dedup( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + 
n_proc, + **kwargs, +): + """Find and remove PCR/optical duplicates. + + Find PCR duplicates in an upper-triangular flipped sorted pairs/pairsam + file. Allow for a +/-N bp mismatch at each side of duplicated molecules. + + PAIRS_PATH : input triu-flipped sorted .pairs or .pairsam file. If the + path ends with .gz/.lz4, the input is decompressed by bgzip/lz4c. + By default, the input is read from stdin. + """ + + dedup_py( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + n_proc, + **kwargs, + ) + + +if __name__ == "__main__": + dedup() + + +def dedup_py( + pairs_path, + output, + output_dups, + output_unmapped, + output_stats, + output_bytile_dups_stats, + chunksize, + carryover, + max_mismatch, + method, + sep, + comment_char, + send_header_to, + c1, + c2, + p1, + p2, + s1, + s2, + unmapped_chrom, + mark_dups, + extra_col_pair, + keep_parent_id, + backend, + n_proc, + **kwargs, +): + + sep = ast.literal_eval('"""' + sep + '"""') + send_header_to_dedup = send_header_to in ["both", "dedup"] + send_header_to_dup = send_header_to in ["both", "dups"] + + instream = ( + fileio.auto_open( + pairs_path, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if pairs_path + else sys.stdin + ) + outstream = ( + fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) + out_stats_stream = ( + fileio.auto_open( + output_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_stats + else None + ) + + out_bytile_dups_stats_stream = ( + fileio.auto_open( + output_bytile_dups_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output_bytile_dups_stats + else None + ) + + # generate empty PairCounter if stats output is requested: + out_stat = PairCounter() if output_stats else None + + if not output_dups: + outstream_dups = None + elif output_dups == "-" or ( + pathlib.Path(output_dups).absolute() == pathlib.Path(output).absolute() + ): + outstream_dups = outstream + else: + outstream_dups = fileio.auto_open( + output_dups, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + if not output_unmapped: + outstream_unmapped = None + elif output_unmapped == "-" or ( + pathlib.Path(output_unmapped).absolute() == pathlib.Path(output).absolute() + ): + outstream_unmapped = outstream + elif ( + pathlib.Path(output_unmapped).absolute() == pathlib.Path(output_dups).absolute() + ): + outstream_unmapped = outstream_dups + else: + outstream_unmapped = fileio.auto_open( + output_unmapped, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + header, body_stream = headerops.get_header(instream) + header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME) + if send_header_to_dedup: + outstream.writelines((l + "\n" for l in header)) + if send_header_to_dup and outstream_dups and (outstream_dups != outstream): + dups_header = header + if keep_parent_id and len(dups_header) > 0: + dups_header = headerops.append_columns(dups_header, ["parent_readID"]) + outstream_dups.writelines((l + "\n" for l in dups_header)) + if ( + outstream_unmapped + and 
(outstream_unmapped != outstream) + and (outstream_unmapped != outstream_dups) + ): + outstream_unmapped.writelines((l + "\n" for l in header)) + + column_names = headerops.extract_column_names(header) + extra_cols1 = [] + extra_cols2 = [] + if extra_col_pair is not None: + for col1, col2 in extra_col_pair: + extra_cols1.append(column_names[col1] if col1.isdigit() else col1) + extra_cols2.append(column_names[col2] if col2.isdigit() else col2) + + if backend == "cython": + # warnings.warn( + # "'cython' backend is deprecated and provided only" + # " for backwards compatibility", + # DeprecationWarning, + # ) + extra_cols1 = [column_names.index(col) for col in extra_cols1] + extra_cols2 = [column_names.index(col) for col in extra_cols2] + streaming_dedup_cython( + method, + max_mismatch, + sep, + c1, + c2, + p1, + p2, + s1, + s2, + extra_cols1, + extra_cols2, + unmapped_chrom, + body_stream, + outstream, + outstream_dups, + outstream_unmapped, + out_stat, + mark_dups, + keep_parent_id, + ) + elif backend in ("scipy", "sklearn"): + streaming_dedup( + in_stream=instream, + colnames=column_names, + chunksize=chunksize, + carryover=carryover, + method=method, + mark_dups=mark_dups, + max_mismatch=max_mismatch, + extra_col_pairs=list(extra_col_pair), + keep_parent_id=keep_parent_id, + unmapped_chrom=unmapped_chrom, + comment_char=comment_char, + outstream=outstream, + outstream_dups=outstream_dups, + outstream_unmapped=outstream_unmapped, + out_stat=out_stat, + backend=backend, + n_proc=n_proc, + ) + else: + raise ValueError("Unknown backend") + + # save statistics to a file if it was requested: + if out_stat: + out_stat.save(out_stats_stream) + + if output_bytile_dups_stats: + out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + + if instream != sys.stdin: + instream.close() + + if outstream != sys.stdout: + outstream.close() + + if outstream_dups and (outstream_dups != outstream): + outstream_dups.close() + + if ( + outstream_unmapped + and (outstream_unmapped != outstream) + and (outstream_unmapped != outstream_dups) + ): + outstream_unmapped.close() + + if out_stats_stream: + out_stats_stream.close() + diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py new file mode 100755 index 00000000..4fb10935 --- /dev/null +++ b/pairtools/cli/stats.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import io +import sys +import click +import warnings + +import pandas as pd +from ..lib import fileio, pairsam_format, headerops +from . import cli, common_io_options + +from ..lib.stats import PairCounter, do_merge + +UTIL_NAME = "pairtools_stats" + + +@cli.command() +@click.argument("input_path", type=str, nargs=-1, required=False) +@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") +@click.option( + "--merge", + is_flag=True, + help="If specified, merge multiple input stats files instead of calculating" + " statistics of a .pairs/.pairsam file. Merging is performed via summation of" + " all overlapping statistics. Non-overlapping statistics are appended to" + " the end of the file.", +) +@click.option( + "--bytile-dups/--no-bytile-dups", + default=False, + help="If enabled, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately." 
+ " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@click.option( + "--output-bytile-dups-stats", + type=str, + default="", + help="If specified, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately and will save details to this path." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", +) +@common_io_options +def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): + """Calculate pairs statistics. + + INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. + If not provided, the input is read from stdin. + If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number + of stats files to merge. + + The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. + """ + stats_py( + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, + ) + + +def stats_py( + input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs +): + if merge: + do_merge(output, input_path, **kwargs) + return + + instream = ( + fileio.auto_open( + input_path[0], + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + if input_path + else sys.stdin + ) + outstream = ( + fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + if output + else sys.stdout + ) + + header, body_stream = headerops.get_header(instream) + cols = headerops.extract_column_names(header) + + if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: + warnings.warn( + "No 'parent_readID' column in the file, not generating duplicate stats." + ) + bytile_dups = False + output_bytile_dups_stats = False + # new stats class stuff would come here ... + stats = PairCounter() + + # Collecting statistics + for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): + stats.add_pairs_from_dataframe(chunk) + + if output_bytile_dups_stats: + stats.save_bytile_dups() + stats.calculate_summaries() + + # save statistics to file ... + stats.save(outstream) + + if instream != sys.stdin: + instream.close() + if outstream != sys.stdout: + outstream.close() + + +if __name__ == "__main__": + stats() diff --git a/pairtools/pairtools_dedup.py b/pairtools/pairtools_dedup.py deleted file mode 100644 index cca5ce89..00000000 --- a/pairtools/pairtools_dedup.py +++ /dev/null @@ -1,1025 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from distutils.log import warn -import sys -import ast -import warnings -import pathlib - -import click - -import numpy as np -import pandas as pd - -import scipy.spatial -from scipy.sparse import coo_matrix -from scipy.sparse.csgraph import connected_components - -from . import _dedup, _fileio, _pairsam_format, _headerops, cli, common_io_options -from .pairtools_markasdup import mark_split_pair_as_dup -from .pairtools_stats import PairCounter - - -import time - -UTIL_NAME = "pairtools_dedup" - -# you don't need to load more than 10k lines at a time b/c you get out of the -# CPU cache, so this parameter is not adjustable -MAX_LEN = 10000 - - -@cli.command() -@click.argument("pairs_path", type=str, required=False) - -### Output files: -@click.option( - "-o", - "--output", - type=str, - default="", - help="output file for pairs after duplicate removal." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." 
- " By default, the output is printed into stdout.", -) -@click.option( - "--output-dups", - type=str, - default="", - help="output file for duplicated pairs. " - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " If the path is the same as in --output or -, output duplicates together " - " with deduped pairs. By default, duplicates are dropped.", -) -@click.option( - "--output-unmapped", - type=str, - default="", - help="output file for unmapped pairs. " - "If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed. " - "If the path is the same as in --output or -, output unmapped pairs together " - "with deduped pairs. If the path is the same as --output-dups, output " - "unmapped reads together with dups. By default, unmapped pairs are dropped.", -) -@click.option( - "--output-stats", - type=str, - default="", - help="output file for duplicate statistics." - " If file exists, it will be open in the append mode." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, statistics are not printed.", -) -@click.option( - "--output-bytile-dups-stats", - type=str, - default="", - help="output file for duplicate statistics." - " If file exists, it will be open in the append mode." - " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, by-tile duplicate statistics are not printed.", -) -### Set the dedup method: -@click.option( - "--max-mismatch", - type=int, - default=3, - show_default=True, - help="Pairs with both sides mapped within this distance (bp) from each " - "other are considered duplicates. ", -) -@click.option( - "--method", - type=click.Choice(["max", "sum"]), - default="max", - help="define the mismatch as either the max or the sum of the mismatches of" - "the genomic locations of the both sides of the two compared molecules", - show_default=True, -) -@click.option( - "--backend", - type=click.Choice(["scipy", "sklearn", "cython"]), - default="scipy", - help="What backend to use: scipy and sklearn are based on KD-trees," - " cython is online indexed list-based algorithm." - " With cython backend, duplication is not transitive with non-zero max mismatch " - " (e.g. pairs A and B are duplicates, and B and C are duplicates, then A and C are " - " not necessary duplicates of each other), while with scipy and sklearn it's " - " transitive (i.e. A and C are necessarily duplicates)." - " Cython is the original version used in pairtools since its beginning." - " It is available for backwards compatibility and to allow specification of the" - " column order." - " Now the default scipy backend is generally the fastest, and with chunksize below" - " 1 mln has the lowest memory requirements." - # " 'cython' is deprecated and provided for backwards compatibility", -) - -### Scipy and sklearn-specific options: -@click.option( - "--chunksize", - type=int, - default=100_000, - show_default=True, - help="Number of pairs in each chunk. Reduce for lower memory footprint." - " Below 10,000 performance starts suffering significantly and the algorithm might" - " miss a few duplicates with non-zero --max-mismatch." - " Only works with '--backend scipy or sklearn'", -) -@click.option( - "--carryover", - type=int, - default=100, - show_default=True, - help="Number of deduped pairs to carry over from previous chunk to the new chunk" - " to avoid breaking duplicate clusters." 
- " Only works with '--backend scipy or sklearn'", -) -@click.option( - "-p", - "--n-proc", - type=int, - default=1, - help="Number of cores to use. Only applies with sklearn backend." - "Still needs testing whether it is ever useful.", -) - -### Output options: -@click.option( - "--mark-dups", - is_flag=True, - help='If specified, duplicate pairs are marked as DD in "pair_type" and ' - "as a duplicate in the sam entries.", -) -@click.option( - "--keep-parent-id", - is_flag=True, - help="If specified, duplicate pairs are marked with the readID of the retained" - " deduped read in the 'parent_readID' field.", -) -@click.option( - "--extra-col-pair", - nargs=2, - # type=click.Tuple([str, str]), - multiple=True, - help="Extra columns that also must match for two pairs to be marked as " - "duplicates. Can be either provided as 0-based column indices or as column " - 'names (requires the "#columns" header field). The option can be provided ' - "multiple times if multiple column pairs must match. " - 'Example: --extra-col-pair "phase1" "phase2"', -) - -### Input options: -@click.option( - "--sep", - type=str, - default=_pairsam_format.PAIRSAM_SEP_ESCAPE, - help=r"Separator (\t, \v, etc. characters are " "supported, pass them in quotes) ", -) -@click.option( - "--comment-char", type=str, default="#", help="The first character of comment lines" -) -@click.option( - "--send-header-to", - type=click.Choice(["dups", "dedup", "both", "none"]), - default="both", - help="Which of the outputs should receive header and comment lines", -) -@click.option( - "--c1", - type=int, - default=_pairsam_format.COL_C1, - help=f"Chrom 1 column; default {_pairsam_format.COL_C1}" - " Only works with '--backend cython'", -) -@click.option( - "--c2", - type=int, - default=_pairsam_format.COL_C2, - help=f"Chrom 2 column; default {_pairsam_format.COL_C2}" - " Only works with '--backend cython'", -) -@click.option( - "--p1", - type=int, - default=_pairsam_format.COL_P1, - help=f"Position 1 column; default {_pairsam_format.COL_P1}" - " Only works with '--backend cython'", -) -@click.option( - "--p2", - type=int, - default=_pairsam_format.COL_P2, - help=f"Position 2 column; default {_pairsam_format.COL_P2}" - " Only works with '--backend cython'", -) -@click.option( - "--s1", - type=int, - default=_pairsam_format.COL_S1, - help=f"Strand 1 column; default {_pairsam_format.COL_S1}" - " Only works with '--backend cython'", -) -@click.option( - "--s2", - type=int, - default=_pairsam_format.COL_S2, - help=f"Strand 2 column; default {_pairsam_format.COL_S2}" - " Only works with '--backend cython'", -) -@click.option( - "--unmapped-chrom", - type=str, - default=_pairsam_format.UNMAPPED_CHROM, - help="Placeholder for a chromosome on an unmapped side; default {}".format( - _pairsam_format.UNMAPPED_CHROM - ), -) -@common_io_options -def dedup( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, -): - """Find and remove PCR/optical duplicates. - - Find PCR duplicates in an upper-triangular flipped sorted pairs/pairsam - file. Allow for a +/-N bp mismatch at each side of duplicated molecules. - - PAIRS_PATH : input triu-flipped sorted .pairs or .pairsam file. If the - path ends with .gz/.lz4, the input is decompressed by bgzip/lz4c. 
- By default, the input is read from stdin. - """ - - dedup_py( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, - ) - - -if __name__ == "__main__": - dedup() - - -def dedup_py( - pairs_path, - output, - output_dups, - output_unmapped, - output_stats, - output_bytile_dups_stats, - chunksize, - carryover, - max_mismatch, - method, - sep, - comment_char, - send_header_to, - c1, - c2, - p1, - p2, - s1, - s2, - unmapped_chrom, - mark_dups, - extra_col_pair, - keep_parent_id, - backend, - n_proc, - **kwargs, -): - - sep = ast.literal_eval('"""' + sep + '"""') - send_header_to_dedup = send_header_to in ["both", "dedup"] - send_header_to_dup = send_header_to in ["both", "dups"] - - instream = ( - _fileio.auto_open( - pairs_path, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - if pairs_path - else sys.stdin - ) - outstream = ( - _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output - else sys.stdout - ) - out_stats_stream = ( - _fileio.auto_open( - output_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output_stats - else None - ) - - out_bytile_dups_stats_stream = ( - _fileio.auto_open( - output_bytile_dups_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output_bytile_dups_stats - else None - ) - - # generate empty PairCounter if stats output is requested: - out_stat = PairCounter() if output_stats else None - - if not output_dups: - outstream_dups = None - elif output_dups == "-" or ( - pathlib.Path(output_dups).absolute() == pathlib.Path(output).absolute() - ): - outstream_dups = outstream - else: - outstream_dups = _fileio.auto_open( - output_dups, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - if not output_unmapped: - outstream_unmapped = None - elif output_unmapped == "-" or ( - pathlib.Path(output_unmapped).absolute() == pathlib.Path(output).absolute() - ): - outstream_unmapped = outstream - elif ( - pathlib.Path(output_unmapped).absolute() == pathlib.Path(output_dups).absolute() - ): - outstream_unmapped = outstream_dups - else: - outstream_unmapped = _fileio.auto_open( - output_unmapped, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - header, body_stream = _headerops.get_header(instream) - header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME) - if send_header_to_dedup: - outstream.writelines((l + "\n" for l in header)) - if send_header_to_dup and outstream_dups and (outstream_dups != outstream): - dups_header = header - if keep_parent_id and len(dups_header) > 0: - dups_header = _headerops.append_columns(dups_header, ["parent_readID"]) - outstream_dups.writelines((l + "\n" for l in dups_header)) - if ( - outstream_unmapped - and (outstream_unmapped != outstream) - and (outstream_unmapped != outstream_dups) - ): - outstream_unmapped.writelines((l + "\n" for l in header)) - - column_names = _headerops.extract_column_names(header) - extra_cols1 = [] - extra_cols2 = [] - if extra_col_pair is not None: - for col1, col2 in extra_col_pair: - extra_cols1.append(column_names[col1] if col1.isdigit() 
else col1) - extra_cols2.append(column_names[col2] if col2.isdigit() else col2) - - if backend == "cython": - # warnings.warn( - # "'cython' backend is deprecated and provided only" - # " for backwards compatibility", - # DeprecationWarning, - # ) - extra_cols1 = [column_names.index(col) for col in extra_cols1] - extra_cols2 = [column_names.index(col) for col in extra_cols2] - streaming_dedup_cython( - method, - max_mismatch, - sep, - c1, - c2, - p1, - p2, - s1, - s2, - extra_cols1, - extra_cols2, - unmapped_chrom, - body_stream, - outstream, - outstream_dups, - outstream_unmapped, - out_stat, - mark_dups, - keep_parent_id, - ) - elif backend in ("scipy", "sklearn"): - streaming_dedup( - in_stream=instream, - colnames=column_names, - chunksize=chunksize, - carryover=carryover, - method=method, - mark_dups=mark_dups, - max_mismatch=max_mismatch, - extra_col_pairs=list(extra_col_pair), - keep_parent_id=keep_parent_id, - unmapped_chrom=unmapped_chrom, - comment_char=comment_char, - outstream=outstream, - outstream_dups=outstream_dups, - outstream_unmapped=outstream_unmapped, - out_stat=out_stat, - backend=backend, - n_proc=n_proc, - ) - else: - raise ValueError("Unknown backend") - - # save statistics to a file if it was requested: - if out_stat: - out_stat.save(out_stats_stream) - - if output_bytile_dups_stats: - out_stat.save_bytile_dups(out_bytile_dups_stats_stream) - - if instream != sys.stdin: - instream.close() - - if outstream != sys.stdout: - outstream.close() - - if outstream_dups and (outstream_dups != outstream): - outstream_dups.close() - - if ( - outstream_unmapped - and (outstream_unmapped != outstream) - and (outstream_unmapped != outstream_dups) - ): - outstream_unmapped.close() - - if out_stats_stream: - out_stats_stream.close() - - -### Chunked dedup ### -def streaming_dedup( - in_stream, - colnames, - chunksize, - carryover, - method, - mark_dups, - max_mismatch, - extra_col_pairs, - unmapped_chrom, - comment_char, - outstream, - outstream_dups, - outstream_unmapped, - keep_parent_id, - out_stat, - backend, - n_proc, -): - - deduped_chunks = _dedup_stream( - in_stream=in_stream, - colnames=colnames, - method=method, - chunksize=chunksize, - carryover=carryover, - mark_dups=mark_dups, - max_mismatch=max_mismatch, - extra_col_pairs=extra_col_pairs, - keep_parent_id=keep_parent_id, - comment_char=comment_char, - backend=backend, - n_proc=n_proc, - ) - - for df_chunk in deduped_chunks: - # Write the stats if requested: - if out_stat is not None: - out_stat.add_pairs_from_dataframe( - df_chunk, - unmapped_chrom=unmapped_chrom, - analyse_bytile_dups=keep_parent_id, - ) - - # Define masks of unmapped and duplicated reads: - mask_mapped = np.logical_and( - (df_chunk["chrom1"] != unmapped_chrom), - (df_chunk["chrom2"] != unmapped_chrom), - ) - mask_duplicates = df_chunk["duplicate"] - - # Clean up dataframe: - df_chunk = df_chunk.drop(columns=["duplicate"]) - - # Stream the dups: - if outstream_dups: - df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv( - outstream_dups, index=False, header=False, sep="\t" - ) - - # Drop readID if it was created (not needed for nodup and unmapped pairs): - if keep_parent_id: - df_chunk = df_chunk.drop(columns=["parent_readID"]) - - # Stream unmapped: - if outstream_unmapped: - df_chunk.loc[~mask_mapped, :].to_csv( - outstream_unmapped, index=False, header=False, sep="\t" - ) - - # Stream unique pairs: - df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv( - outstream, index=False, header=False, sep="\t" - ) - - -def _dedup_stream( - 
in_stream, - colnames, - method, - chunksize, - carryover, - mark_dups, - max_mismatch, - extra_col_pairs, - keep_parent_id, - comment_char, - backend, - n_proc, -): - # Stream the input dataframe: - dfs = pd.read_table( - in_stream, comment=comment_char, names=colnames, chunksize=chunksize - ) - - # Set up the carryover dataframe: - df_prev_nodups = pd.DataFrame([]) - prev_i = 0 - - # Iterate over chunks: - for df in dfs: - df_marked = _dedup_chunk( - pd.concat([df_prev_nodups, df], axis=0, ignore_index=True).reset_index( - drop=True - ), - r=max_mismatch, - method=method, - keep_parent_id=keep_parent_id, - extra_col_pairs=extra_col_pairs, - backend=backend, - n_proc=n_proc, - ) - df_marked = df_marked.loc[prev_i:, :].reset_index(drop=True) - mask_duplicated = df_marked["duplicate"] - if mark_dups: - df_marked.loc[mask_duplicated, "pair_type"] = "DD" - - # Filter out duplicates and store specific columns: - df_nodups = df_marked.loc[~mask_duplicated, colnames] - - # Re-define carryover pairs: - df_prev_nodups = df_nodups.tail(carryover).reset_index(drop=True) - prev_i = len(df_prev_nodups) - - yield df_marked - - -def _dedup_chunk( - df, - r, - method, - keep_parent_id, - extra_col_pairs, - backend, - unmapped_chrom="!", - n_proc=1, -): - """Mark duplicates in a dataframe of pairs - - Parameters - ---------- - df : pd.DataFrame - Dataframe with pairs, has to contain columns 'chrom1', 'pos1', 'chrom2', 'pos2' - 'strand1', 'strand2' - r : int - Allowed distance between two pairs to call them duplicates - method : str - 'sum' or 'max' - whether 'r' uses sum of distances on two ends of pairs, or the - maximal distance - keep_parent_id : bool - If True, the read ID of the read that was not labelled as a duplicate from a - group of duplicates is recorded for each read marked as duplicate. - Only possible with non-cython backends - extra_col_pairs : list of tuples - List of extra column pairs that need to match between two reads for them be - considered duplicates (e.g. useful if alleles are annotated) - backend : str - 'scipy', 'sklearn', 'cython' - unmapped_chrom : str, optional - Which character denotes unmapped reads in the chrom1/chrom2 fields, - by default "!" 
- n_proc : int, optional - How many cores to use, by default 1 - Only works for 'sklearn' backend - - Returns - ------- - pd.DataFrame - Dataframe with marked duplicates (extra boolean field 'duplicate'), and - optionally recorded 'parent_readID' - - """ - if method not in ("max", "sum"): - raise ValueError('Unknown method, only "sum" or "max" allowed') - if backend == "sklearn": - from sklearn import neighbors - - if method == "sum": - p = 1 - else: - p = np.inf - - # Store the index of the dataframe: - index_colname = df.index.name - if index_colname is None: - index_colname = "index" - df = df.reset_index() # Remove the index temporarily - - # Set up columns to store the dedup info: - df.loc[:, "clusterid"] = np.nan - df.loc[:, "duplicate"] = False - - # Split mapped and unmapped reads: - mask_unmapped = (df["chrom1"] == unmapped_chrom) | (df["chrom2"] == unmapped_chrom) - df_unmapped = df.loc[mask_unmapped, :].copy() - df_mapped = df.loc[~mask_unmapped, :].copy() - N_mapped = df_mapped.shape[0] - - # If there are some mapped reads, dedup them: - if N_mapped > 0: - if backend == "sklearn": - a = neighbors.radius_neighbors_graph( - df_mapped[["pos1", "pos2"]], radius=r, p=p, n_jobs=n_proc, - ) - a0, a1 = a.nonzero() - elif backend == "scipy": - z = scipy.spatial.cKDTree(df_mapped[["pos1", "pos2"]]) - a = z.query_pairs(r=r, p=p, output_type="ndarray") - a0 = a[:, 0] - a1 = a[:, 1] - need_to_match = np.array( - [ - ("chrom1", "chrom1"), - ("chrom2", "chrom2"), - ("strand1", "strand1"), - ("strand2", "strand2"), - ] - + extra_col_pairs - ) - nonpos_matches = np.all( - [ - df_mapped.iloc[a0, df_mapped.columns.get_loc(lc)].values - == df_mapped.iloc[a1, df_mapped.columns.get_loc(rc)].values - for (lc, rc) in need_to_match - ], - axis=0, - ) - a0 = a0[nonpos_matches] - a1 = a1[nonpos_matches] - a_mat = coo_matrix((np.ones_like(a0), (a0, a1)), shape=(N_mapped, N_mapped)) - - # Set up inferred clusterIDs: - df_mapped.loc[:, "clusterid"] = connected_components(a_mat, directed=False)[1] - - mask_dups = df_mapped["clusterid"].duplicated() - df_mapped.loc[mask_dups, "duplicate"] = True - - # Mark parent IDs if requested: - if keep_parent_id: - df_mapped.loc[:, "parent_readID"] = df_mapped["clusterid"].map( - df_mapped[~mask_dups].set_index("clusterid")["readID"] - ) - df_unmapped.loc[:, "parent_readID"] = "" - - # Reconstruct original dataframe with removed duplicated reads: - df = pd.concat([df_unmapped, df_mapped]).reset_index(drop=True) - df = df.set_index(index_colname) # Set up the original index - df = df.drop( - ["clusterid"], axis=1 - ) # Remove the information that we don't need anymore: - - return df - - -def streaming_dedup_cython( - method, - max_mismatch, - sep, - c1ind, - c2ind, - p1ind, - p2ind, - s1ind, - s2ind, - extra_cols1, - extra_cols2, - unmapped_chrom, - instream, - outstream, - outstream_dups, - outstream_unmapped, - out_stat, - mark_dups, - keep_parent_id=False, - readid_ind=0, -): - """ - Cython-powered deduplication with online algorithm based on indexed list. 
- - Parameters - ---------- - method: "max" or "sum" - max_mismatch: maximum allowed mismatch to count the pairs as duplicates - sep: separator of the fields in the input file - c1ind: index of the chr1 column - c2ind: index of the chr2 column - p1ind: index of the pos1 column - p2ind: index of the pos2 column - s1ind: index of the strand1 column - s2ind: index of the strand2 column - extra_cols1: extra columns for left alignment in a pair to add - extra_cols2: extra columns for right alignment in a pair to add - unmapped_chrom: Symbol of the chromosome for the unmapped alignment - instream: input stream of pair file - outstream: output stram of deduplicated pairs - outstream_dups: output stream of duplicates (optionally with added parent_id, see keep_parent_id option) - outstream_unmapped: output stram of unmapped pairs - out_stat: output statistics - mark_dups: if True, will add "DD" as the pair_type - keep_parent_id: if True, additional column "parent_readID will be added to the output, can be useful for optical duplicates search - readid_ind: index of the readID column in the input file - - Returns - ------- - - """ - maxind = max(c1ind, c2ind, p1ind, p2ind, s1ind, s2ind) - if bool(extra_cols1) and bool(extra_cols2): - maxind = max(maxind, max(extra_cols1), max(extra_cols2)) - - all_scols1 = [s1ind] + extra_cols1 - all_scols2 = [s2ind] + extra_cols2 - - # if we do stats in the dedup, we need PAIR_TYPE - # i do not see way around this: - if out_stat: - ptind = _pairsam_format.COL_PTYPE - maxind = max(maxind, ptind) - - dd = _dedup.OnlineDuplicateDetector( - method, max_mismatch, returnData=False, keep_parent_id=keep_parent_id - ) - - c1 = [] - c2 = [] - p1 = [] - p2 = [] - s1 = [] - s2 = [] - idx = [] - line_buffer = [] - cols_buffer = [] - chromDict = {} - strandDict = {} - curMaxLen = max(MAX_LEN, dd.getLen()) - - instream = iter(instream) - read_idx = 0 # read index to mark the parent readID - while True: - rawline = next(instream, None) - stripline = rawline.strip() if rawline else None - - # take care of empty lines not at the end of the file separately - if rawline and (not stripline): - warnings.warn("Empty line detected not at the end of the file") - continue - - if stripline: - cols = stripline.split(sep) - if len(cols) <= maxind: - raise ValueError( - "Error parsing line {}: ".format(stripline) - + " expected {} columns, got {}".format(maxind, len(cols)) - ) - - if (cols[c1ind] == unmapped_chrom) or (cols[c2ind] == unmapped_chrom): - - if outstream_unmapped: - outstream_unmapped.write(stripline) - # don't forget terminal newline - outstream_unmapped.write("\n") - - # add a pair to PairCounter if stats output is requested: - if out_stat: - out_stat.add_pair( - cols[c1ind], - int(cols[p1ind]), - cols[s1ind], - cols[c2ind], - int(cols[p2ind]), - cols[s2ind], - cols[ptind], - ) - else: - line_buffer.append(stripline) - cols_buffer.append(cols) - - c1.append(fetchadd(cols[c1ind], chromDict)) - c2.append(fetchadd(cols[c2ind], chromDict)) - p1.append(int(cols[p1ind])) - p2.append(int(cols[p2ind])) - - idx.append(read_idx) - read_idx += 1 - - if bool(extra_cols1) and bool(extra_cols2): - s1.append( - fetchadd("".join(cols[i] for i in all_scols1), strandDict) - ) - s2.append( - fetchadd("".join(cols[i] for i in all_scols2), strandDict) - ) - else: - s1.append(fetchadd(cols[s1ind], strandDict)) - s2.append(fetchadd(cols[s2ind], strandDict)) - if (not stripline) or (len(c1) == curMaxLen): - if keep_parent_id: - res, parents = dd.push( - ar(c1, 32), - ar(c2, 32), - ar(p1, 32), - ar(p2, 
32), - ar(s1, 32), - ar(s2, 32), - ) - - else: - res = dd.push( - ar(c1, 32), - ar(c2, 32), - ar(p1, 32), - ar(p2, 32), - ar(s1, 32), - ar(s2, 32), - ) - - if not stripline: - if keep_parent_id: - res_tmp, parents_tmp = dd.finish() - parents = np.concatenate([parents, parents_tmp]) - - else: - res_tmp = dd.finish() - res = np.concatenate([res, res_tmp]) - - for i in range(len(res)): - # not duplicated pair: - if not res[i]: - outstream.write(line_buffer[i]) - # don't forget terminal newline - outstream.write("\n") - if out_stat: - out_stat.add_pair( - cols_buffer[i][c1ind], - int(cols_buffer[i][p1ind]), - cols_buffer[i][s1ind], - cols_buffer[i][c2ind], - int(cols_buffer[i][p2ind]), - cols_buffer[i][s2ind], - cols_buffer[i][ptind], - ) - # duplicated pair: - else: - if out_stat: - out_stat.add_pair( - cols_buffer[i][c1ind], - int(cols_buffer[i][p1ind]), - cols_buffer[i][s1ind], - cols_buffer[i][c2ind], - int(cols_buffer[i][p2ind]), - cols_buffer[i][s2ind], - "DD", - ) - if outstream_dups: - if mark_dups: # DD-marked pair: - output = sep.join(mark_split_pair_as_dup(cols_buffer[i])) - else: # pair as is: - output = line_buffer[i] - - if keep_parent_id: # Add parentID as the last column: - parent_readID = line_buffer[parents[i]].split(sep)[ - readid_ind - ] - output = sep.join([output, parent_readID]) - - outstream_dups.write(output) - - # don't forget terminal newline - outstream_dups.write("\n") - - # flush buffers and perform necessary checks here: - c1 = [] - c2 = [] - p1 = [] - p2 = [] - s1 = [] - s2 = [] - line_buffer = line_buffer[len(res) :] - cols_buffer = cols_buffer[len(res) :] - if not stripline: - if len(line_buffer) != 0: - raise ValueError( - "{} lines left in the buffer, ".format(len(line_buffer)) - + "should be none;" - + "something went terribly wrong" - ) - break - # process next line ... - - # all lines have been processed at this point. - # streaming_dedup is over. - - -def fetchadd(key, mydict): - key = key.strip() - if key not in mydict: - mydict[key] = len(mydict) - return mydict[key] - - -def ar(mylist, val): - return np.array(mylist, dtype={8: np.int8, 16: np.int16, 32: np.int32}[val]) diff --git a/pairtools/pairtools_stats.py b/pairtools/pairtools_stats.py deleted file mode 100755 index f0dd71cd..00000000 --- a/pairtools/pairtools_stats.py +++ /dev/null @@ -1,835 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import io -import sys -import click -import warnings - -import numpy as np -import pandas as pd -from scipy import special - -from collections.abc import Mapping - -from . import _fileio, _pairsam_format, cli, _headerops, common_io_options - -UTIL_NAME = "pairtools_stats" - - -@cli.command() -@click.argument("input_path", type=str, nargs=-1, required=False) -@click.option("-o", "--output", type=str, default="", help="output stats tsv file.") -@click.option( - "--merge", - is_flag=True, - help="If specified, merge multiple input stats files instead of calculating" - " statistics of a .pairs/.pairsam file. Merging is performed via summation of" - " all overlapping statistics. Non-overlapping statistics are appended to" - " the end of the file.", -) -@click.option( - "--bytile-dups/--no-bytile-dups", - default=False, - help="If enabled, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately." 
- " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", -) -@click.option( - "--output-bytile-dups-stats", - type=str, - default="", - help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately and will save details to this path." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", -) -@common_io_options -def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): - """Calculate pairs statistics. - - INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. - If not provided, the input is read from stdin. - If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number - of stats files to merge. - - The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. - """ - stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, - ) - - -def stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs -): - if merge: - do_merge(output, input_path, **kwargs) - return - - instream = ( - _fileio.auto_open( - input_path[0], - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - if input_path - else sys.stdin - ) - outstream = ( - _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - if output - else sys.stdout - ) - - header, body_stream = _headerops.get_header(instream) - cols = _headerops.extract_column_names(header) - - if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: - warnings.warn( - "No 'parent_readID' column in the file, not generating duplicate stats." - ) - bytile_dups = False - output_bytile_dups_stats = False - # new stats class stuff would come here ... - stats = PairCounter() - - # Collecting statistics - for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): - stats.add_pairs_from_dataframe(chunk) - - if output_bytile_dups_stats: - stats.save_bytile_dups() - stats.calculate_summaries() - - # save statistics to file ... - stats.save(outstream) - - if instream != sys.stdin: - instream.close() - if outstream != sys.stdout: - outstream.close() - - -def do_merge(output, files_to_merge, **kwargs): - # Parse all stats files. - stats = [] - for stat_file in files_to_merge: - f = _fileio.auto_open( - stat_file, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - # use a factory method to instanciate PairCounter - stat = PairCounter.from_file(f) - stats.append(stat) - f.close() - - # combine stats from several files (files_to_merge): - out_stat = sum(stats) - - # Save merged stats. - outstream = _fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - # save statistics to file ... 
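# For reference, the same merge can be driven from Python (hypothetical file
# names; sum() falls back on PairCounter.__radd__ for the initial 0):
#
#   counters = [PairCounter.from_file(open(p)) for p in ("lane1.stats", "lane2.stats")]
#   merged = sum(counters)
#   with open("merged.stats", "w") as out:
#       merged.save(out)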
- out_stat.save(outstream) - - if outstream != sys.stdout: - outstream.close() - - -def estimate_library_complexity(nseq, ndup, nopticaldup=0): - """Estimate library complexity accounting for optical/clustering duplicates - - Parameters - ---------- - nseq : int - Total number of sequences - ndup : int - Total number of duplicates - nopticaldup : int, optional - Number of non-PCR duplicates, by default 0 - - Returns - ------- - float - Estimated complexity - """ - nseq = nseq - nopticaldup - if nseq == 0: - warnings.warn("Empty of fully duplicated library, can't estimate complexity") - return 0 - ndup = ndup - nopticaldup - u = (nseq - ndup) / nseq - seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u - complexity = nseq / seq_to_complexity - return complexity - - -def extract_tile_info(series, regex=False): - """Extract the name of the tile for each read name in the series - - Parameters - ---------- - series : pd.Series - Series containing read IDs - regex : bool, optional - Regex to extract fields from the read IDs that correspond to tile IDs. - By default False, uses a faster predefined approach for typical Illumina - read names - Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" - - Returns - ------- - Series - Series containing tile IDs as strings - """ - if regex: - split = series.str.extractall(regex).unstack().droplevel(1, axis=1) - return split[0] + ":" + split[1] + ":" + split[2] - else: - split = series.str.split(":", expand=True) - return split[2] + ":" + split[3] + ":" + split[4] - - -def analyse_duplicate_stats(dups, tile_dup_regex=False): - """Count by-tile duplicates - - Parameters - ---------- - dups : pd.DataFrame - Dataframe with duplicates that contains pared read IDs - tile_dup_regex : bool, optional - See extract_tile_info for details, by default False - - Returns - ------- - pd.DataFrame - Grouped multi-indexed dataframe of pairwise by-tile duplication counts - """ - dups = dups.copy() - dups["tile"] = extract_tile_info(dups["readID"]) - dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) - dups["same_tile"] = dups["tile"] == dups["parent_tile"] - bytile_dups = ( - dups.groupby(["tile", "parent_tile"]) - .size() - .reset_index(name="dup_count") - .sort_values(["tile", "parent_tile"]) - ) - bytile_dups[["tile", "parent_tile"]] = np.sort( - bytile_dups[["tile", "parent_tile"]].values, axis=1 - ) - bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() - return bytile_dups - - -class PairCounter(Mapping): - """ - A Counter for Hi-C pairs that accumulates various statistics. - - PairCounter implements two interfaces to access multi-level statistics: - 1. as a nested dict, e.g. pairCounter['pair_types']['LL'] - 2. as a flat dict, with the level keys separated by '/', e.g. pairCounter['pair_types/LL'] - - Other features: - -- PairCounters can be saved into/loaded from a file - -- multiple PairCounters can be merged via addition. 
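    A minimal usage sketch (hypothetical coordinates; the DD pair marks a
    duplicate so that the derived complexity summaries stay well defined):

        pc = PairCounter()
        pc.add_pair("chr1", 100, "+", "chr1", 1500, "-", "UU")
        pc.add_pair("chr1", 100, "+", "chr1", 1500, "-", "DD")
        pc["pair_types/UU"]            # flat access   -> 1
        pc["pair_types"]["UU"]         # nested access -> 1
        combined = pc + PairCounter()  # merging via addition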
- """ - - _SEP = "\t" - _KEY_SEP = "/" - - def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): - self._stat = {} - # some variables used for initialization: - # genomic distance bining for the ++/--/-+/+- distribution - self._dist_bins = np.r_[ - 0, - np.round( - 10 - ** np.arange( - min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step - ) - ).astype(np.int), - ] - - # establish structure of an empty _stat: - self._stat["total"] = 0 - self._stat["total_unmapped"] = 0 - self._stat["total_single_sided_mapped"] = 0 - # total_mapped = total_dups + total_nodups - self._stat["total_mapped"] = 0 - self._stat["total_dups"] = 0 - self._stat["total_nodups"] = 0 - ######################################## - # the rest of stats are based on nodups: - ######################################## - self._stat["cis"] = 0 - self._stat["trans"] = 0 - self._stat["pair_types"] = {} - # to be removed: - self._stat["dedup"] = {} - - self._stat["cis_1kb+"] = 0 - self._stat["cis_2kb+"] = 0 - self._stat["cis_4kb+"] = 0 - self._stat["cis_10kb+"] = 0 - self._stat["cis_20kb+"] = 0 - self._stat["cis_40kb+"] = 0 - - self._stat["chrom_freq"] = {} - - self._stat["dist_freq"] = { - "+-": np.zeros(len(self._dist_bins), dtype=np.int), - "-+": np.zeros(len(self._dist_bins), dtype=np.int), - "--": np.zeros(len(self._dist_bins), dtype=np.int), - "++": np.zeros(len(self._dist_bins), dtype=np.int), - } - # Summaries are derived from other stats and are recalculated on merge - self._stat["summary"] = dict( - [ - ("frac_cis", 0), - ("frac_cis_1kb+", 0), - ("frac_cis_2kb+", 0), - ("frac_cis_4kb+", 0), - ("frac_cis_10kb+", 0), - ("frac_cis_20kb+", 0), - ("frac_cis_40kb+", 0), - ("frac_dups", 0), - ("complexity_naive", 0), - ] - ) - self.bytile_dups = pd.DataFrame( - index=pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] - ) - ) - - def __getitem__(self, key): - if isinstance(key, str): - # let's strip any unintentional '/' - # from either side of the key - key = key.strip("/") - if self._KEY_SEP in key: - # multi-key to access nested elements - k_fields = key.split(self._KEY_SEP) - else: - # single-key access flat part of PairCounter - # or to access highest level of hierarchy - return self._stat[key] - else: - # clearly an error: - raise ValueError(f"{key} is not a valid key: must be str") - - # K_FIELDS: - # process multi-key case: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup'] - # get the first 'k' and keep the remainders in 'k_fields' - k = k_fields.pop(0) - if k in ["pair_types", "dedup"]: - # assert there is only one element in key_fields left: - # 'pair_types' and 'dedup' treated the same - if len(k_fields) == 1: - return self._stat[k][k_fields[0]] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 1 identifier" - ) - - elif k == "chrom_freq": - # assert remaining key_fields == [chr1, chr2]: - if len(k_fields) == 2: - return self._stat[k][tuple(k_fields)] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 2 identifiers" - ) - - elif k == "dist_freq": - # assert that last element of key_fields is the 'directions' - # THIS IS DONE FOR CONSISTENCY WITH .stats FILE - # SHOULD THAT BE CHANGED IN .stats AND HERE AS WELL? 
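            # For example, a (hypothetical) flat key "dist_freq/1000-1778/+-" is
            # resolved below by taking the left edge 1000 of the bin label and
            # locating it with np.searchsorted(self._dist_bins, 1000, "right") - 1;
            # the open-ended last bin is labelled with a trailing "+" instead of a
            # right edge and is handled by the strip("+") branch.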
- if len(k_fields) == 2: - # assert 'dirs' in ['++','--','+-','-+'] - dirs = k_fields.pop() - # there is only genomic distance range of the bin that's left: - (bin_range,) = k_fields - # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = ( - bin_range.strip("+") - if bin_range.endswith("+") - else bin_range.split("-")[0] - ) - # get the index of that bin: - bin_idx = ( - np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1 - ) - # store corresponding value: - return self._stat["dist_freq"][dirs][bin_idx] - else: - raise ValueError( - f"{key} is not a valid key: {k} section implies 2 identifiers" - ) - else: - raise ValueError(f"{k} is not a valid key") - - def __iter__(self): - return iter(self._stat) - - def __len__(self): - return len(self._stat) - - def calculate_summaries(self): - """calculate summary statistics (fraction of cis pairs at different cutoffs, - complexity estimate) based on accumulated counts. Results are saved into - self._stat['summary'] - - """ - for cis_count in ( - "cis", - "cis_1kb+", - "cis_2kb+", - "cis_4kb+", - "cis_10kb+", - "cis_20kb+", - "cis_40kb+", - ): - self._stat["summary"][f"frac_{cis_count}"] = ( - (self._stat[cis_count] / self._stat["total_nodups"]) - if self._stat["total_nodups"] > 0 - else 0 - ) - self._stat["summary"]["frac_dups"] = ( - (self._stat["total_dups"] / self._stat["total_mapped"]) - if self._stat["total_mapped"] > 0 - else 0 - ) - self._stat["summary"]["complexity_naive"] = estimate_library_complexity( - self._stat["total_mapped"], self._stat["total_dups"], 0 - ) - if self.bytile_dups.shape[0] > 0: - self._stat["dups_by_tile_median"] = ( - self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] - ) - if "dups_by_tile_median" in self._stat: - self._stat["summary"][ - "complexity_dups_by_tile_median" - ] = estimate_library_complexity( - self._stat["total_mapped"], - self._stat["total_dups"], - self._stat["dups_by_tile_median"], - ) - - @classmethod - def from_file(cls, file_handle): - """create instance of PairCounter from file - - Parameters - ---------- - file_handle: file handle - - Returns - ------- - PairCounter - new PairCounter filled with the contents of the input file - """ - # fill in from file - file_handle: - stat_from_file = cls() - for l in file_handle: - fields = l.strip().split(cls._SEP) - if len(fields) == 0: - # skip empty lines: - continue - if len(fields) != 2: - # expect two _SEP separated values per line: - raise _fileio.ParseError( - "{} is not a valid stats file".format(file_handle.name) - ) - # extract key and value, then split the key: - putative_key, putative_val = fields[0], fields[1] - key_fields = putative_key.split(cls._KEY_SEP) - # we should impose a rigid structure of .stats or redo it: - if len(key_fields) == 1: - key = key_fields[0] - if key in stat_from_file._stat: - stat_from_file._stat[key] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - else: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup', 'summary'] - # get the first 'key' and keep the remainders in 'key_fields' - key = key_fields.pop(0) - if key in ["pair_types", "dedup", "summary"]: - # assert there is only one element in key_fields left: - # 'pair_types' and 'dedup' treated the same - if len(key_fields) == 1: - try: - stat_from_file._stat[key][key_fields[0]] = int(fields[1]) - except ValueError: - stat_from_file._stat[key][key_fields[0]] = float(fields[1]) - else: 
- raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 1 identifier".format( - file_handle.name, key - ) - ) - - elif key == "chrom_freq": - # assert remaining key_fields == [chr1, chr2]: - if len(key_fields) == 2: - stat_from_file._stat[key][tuple(key_fields)] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) - - elif key == "dist_freq": - # assert that last element of key_fields is the 'directions' - if len(key_fields) == 2: - # assert 'dirs' in ['++','--','+-','-+'] - dirs = key_fields.pop() - # there is only genomic distance range of the bin that's left: - (bin_range,) = key_fields - # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = ( - bin_range.strip("+") - if bin_range.endswith("+") - else bin_range.split("-")[0] - ) - # get the index of that bin: - bin_idx = ( - np.searchsorted( - stat_from_file._dist_bins, int(dist_bin_left), "right" - ) - - 1 - ) - # store corresponding value: - stat_from_file._stat[key][dirs][bin_idx] = int(fields[1]) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) - else: - raise _fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - # return PairCounter from a non-empty dict: - return stat_from_file - - def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): - """Gather statistics for a Hi-C pair and add to the PairCounter. - - Parameters - ---------- - chrom1: str - chromosome of the first read - pos1: int - position of the first read - strand1: str - strand of the first read - chrom2: str - chromosome of the first read - pos2: int - position of the first read - strand2: str - strand of the first read - pair_type: str - type of the mapped pair of reads - """ - - self._stat["total"] += 1 - # collect pair type stats including DD: - self._stat["pair_types"][pair_type] = ( - self._stat["pair_types"].get(pair_type, 0) + 1 - ) - if chrom1 == "!" and chrom2 == "!": - self._stat["total_unmapped"] += 1 - elif chrom1 != "!" and chrom2 != "!": - self._stat["total_mapped"] += 1 - # only mapped ones can be duplicates: - if pair_type == "DD": - self._stat["total_dups"] += 1 - else: - self._stat["total_nodups"] += 1 - self._stat["chrom_freq"][(chrom1, chrom2)] = ( - self._stat["chrom_freq"].get((chrom1, chrom2), 0) + 1 - ) - - if chrom1 == chrom2: - self._stat["cis"] += 1 - dist = np.abs(pos2 - pos1) - bin_idx = np.searchsorted(self._dist_bins, dist, "right") - 1 - self._stat["dist_freq"][strand1 + strand2][bin_idx] += 1 - if dist >= 1000: - self._stat["cis_1kb+"] += 1 - if dist >= 2000: - self._stat["cis_2kb+"] += 1 - if dist >= 4000: - self._stat["cis_4kb+"] += 1 - if dist >= 10000: - self._stat["cis_10kb+"] += 1 - if dist >= 20000: - self._stat["cis_20kb+"] += 1 - if dist >= 40000: - self._stat["cis_40kb+"] += 1 - - else: - self._stat["trans"] += 1 - else: - self._stat["total_single_sided_mapped"] += 1 - - def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): - """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. - - Parameters - ---------- - df: pd.DataFrame - DataFrame with pairs. 
Needs to have columns: - 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type' - """ - - total_count = df.shape[0] - self._stat["total"] += total_count - - # collect pair type stats including DD: - for pair_type, type_count in df["pair_type"].value_counts().items(): - self._stat["pair_types"][pair_type] = ( - self._stat["pair_types"].get(pair_type, 0) + type_count - ) - - # Count the unmapped by the "unmapped" chromosomes (debatable, as WW are also marked as ! and they might be mapped): - unmapped_count = np.logical_and( - df["chrom1"] == unmapped_chrom, df["chrom2"] == unmapped_chrom - ).sum() - self._stat["total_unmapped"] += int(unmapped_count) - - # Count the mapped: - df_mapped = df.loc[ - (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), : - ] - mapped_count = df_mapped.shape[0] - - self._stat["total_mapped"] += mapped_count - self._stat["total_single_sided_mapped"] += int( - total_count - (mapped_count + unmapped_count) - ) - - # Count the duplicates: - if "duplicate" in df_mapped.columns: - mask_dups = df_mapped["duplicate"] - else: - mask_dups = df_mapped["pair_type"] == "DD" - dups = df_mapped[mask_dups] - dups_count = dups.shape[0] - self._stat["total_dups"] += int(dups_count) - self._stat["total_nodups"] += int(mapped_count - dups_count) - - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() - - # Count pairs per chromosome: - for (chrom1, chrom2), chrom_count in ( - df_nodups[["chrom1", "chrom2"]].value_counts().items() - ): - self._stat["chrom_freq"][(chrom1, chrom2)] = ( - self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count - ) - - # Count cis-trans by pairs: - - self._stat["cis"] += df_cis.shape[0] - self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] - dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) - - df_cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1 - for (strand1, strand2, bin_id), strand_bin_count in ( - df_cis[["strand1", "strand2", "bin_idx"]].value_counts().items() - ): - self._stat["dist_freq"][strand1 + strand2][bin_id] += strand_bin_count - self._stat["cis_1kb+"] += int(np.sum(dist >= 1000)) - self._stat["cis_2kb+"] += int(np.sum(dist >= 2000)) - self._stat["cis_4kb+"] += int(np.sum(dist >= 4000)) - self._stat["cis_10kb+"] += int(np.sum(dist >= 10000)) - self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) - self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) - - ### Add by-tile dups - if bytile_dups and dups.shape[0] > 0: - self.bytile_dups = self.bytile_dups.add( - analyse_duplicate_stats(dups), fill_value=0 - ).astype(int) - - def __add__(self, other): - # both PairCounter are implied to have a list of common fields: - # - # 'total', 'total_unmapped', 'total_single_sided_mapped', 'total_mapped', - # 'cis', 'trans', 'pair_types', 'cis_1kb+', 'cis_2kb+', - # 'cis_10kb+', 'cis_20kb+', 'chrom_freq', 'dist_freq', 'dedup' - # - # initialize empty PairCounter for the result of summation: - sum_stat = PairCounter() - # use the empty PairCounter to iterate over: - for k, v in sum_stat._stat.items(): - # not nested fields are summed trivially: - if isinstance(v, int): - sum_stat._stat[k] = self._stat[k] + other._stat[k] - # sum nested dicts/arrays in a context dependet manner: - else: - if k in ["pair_types", "dedup"]: - # handy function for summation of a pair of dicts: - # https://stackoverflow.com/questions/10461531/merge-and-sum-of-two-dictionaries - sum_dicts = lambda dict_x, 
dict_y: { - key: dict_x.get(key, 0) + dict_y.get(key, 0) - for key in set(dict_x) | set(dict_y) - } - # sum a pair of corresponding dicts: - sum_stat._stat[k] = sum_dicts(self._stat[k], other._stat[k]) - if k == "chrom_freq": - # union list of keys (chr1,chr2) with potential duplicates: - union_keys_with_dups = list(self._stat[k].keys()) + list( - other._stat[k].keys() - ) - # dict.fromkeys will take care of keys' order and duplicates in a consistent manner: - # https://stackoverflow.com/questions/1720421/how-to-concatenate-two-lists-in-python - # last comment to the 3rd Answer - sum_stat._stat[k] = dict.fromkeys(union_keys_with_dups) - # perform a summation: - for union_key in sum_stat._stat[k]: - sum_stat._stat[k][union_key] = self._stat[k].get( - union_key, 0 - ) + other._stat[k].get(union_key, 0) - if k == "dist_freq": - for dirs in sum_stat[k]: - sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs] - sum_stat.calculate_summaries() - sum_stat["summary"]["complexity_naive"] = ( - self._stat["summary"]["complexity_naive"] - + other._stat["summary"]["complexity_naive"] - ) - if ( - "complexity_dups_by_tile_median" in self._stat - and "complexity_dups_by_tile_median" in other._stat - ): - sum_stat["summary"]["complexity_dups_by_tile_median"] = ( - self._stat["summary"]["complexity_dups_by_tile_median"] - + other._stat["summary"]["complexity_dups_by_tile_median"] - ) - # self.bytile_dups.add(other.bytile_dups, fill_value=0).astype(int) - return sum_stat - - # we need this to be able to sum(list_of_PairCounters) - def __radd__(self, other): - if other == 0: - return self - else: - return self.__add__(other) - - def flatten(self): - - """return a flattened dict (formatted same way as .stats file) - - """ - # dict for flat store: - flat_stat = {} - - # Storing statistics - for k, v in self._stat.items(): - if isinstance(v, int): - flat_stat[k] = v - # store nested dicts/arrays in a context dependet manner: - # nested categories are stored only if they are non-trivial - else: - if (k == "dist_freq") and v: - for i in range(len(self._dist_bins)): - for dirs, freqs in v.items(): - # last bin is treated differently: "100000+" vs "1200-3000": - if i != len(self._dist_bins) - 1: - formatted_key = self._KEY_SEP.join( - ["{}", "{}-{}", "{}"] - ).format( - k, self._dist_bins[i], self._dist_bins[i + 1], dirs - ) - else: - formatted_key = self._KEY_SEP.join( - ["{}", "{}+", "{}"] - ).format(k, self._dist_bins[i], dirs) - # store key,value pair: - flat_stat[formatted_key] = freqs[i] - elif (k in ["pair_types", "dedup", "summary"]) and v: - # 'pair_types', 'dedup' and 'summary' are simple dicts inside, - - # treat them the exact same way: - for k_item, freq in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( - k, k_item - ) - # store key,value pair: - flat_stat[formatted_key] = freq - elif (k == "chrom_freq") and v: - for (chrom1, chrom2), freq in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( - k, chrom1, chrom2 - ) - # store key,value pair: - flat_stat[formatted_key] = freq - - # return flattened dict - return flat_stat - - def save(self, outstream): - """save PairCounter to tab-delimited text file. - Flattened version of PairCounter is stored in the file. - - Parameters - ---------- - outstream: file handle - - - Note - ---- - The order of the keys is not guaranteed - Merging several .stats is not associative with respect to key order: - merge(A,merge(B,C)) != merge(merge(A,B),C). 
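        For instance, a chrom_freq key present in only one of the inputs can end
        up at a different position in the output depending on which two files are
        merged first, even though the same key/value pairs are written either way.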
- - Theys shou5ld match exactly, however, when soprted: - sort(merge(A,merge(B,C))) == sort(merge(merge(A,B),C)) - """ - - # write flattened version of the PairCounter to outstream - for k, v in self.flatten().items(): - outstream.write("{}{}{}\n".format(k, self._SEP, v)) - - def save_bytile_dups(self, outstream): - """save bytile duplication counts to a tab-delimited text file. - - Parameters - ---------- - outstream: file handle - """ - self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) - - -if __name__ == "__main__": - stats() From 034b8d0d8c71be4ced4ea8f9c8e60f1f61c7d336 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:17:56 +0200 Subject: [PATCH 12/20] Test always --- .github/workflows/python-package.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 54fffe4b..da3f924f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -4,10 +4,8 @@ name: Python package on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + push + pull_request jobs: build: From bc647ba43b31c76dc337b136a07babf830db49e4 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:22:07 +0200 Subject: [PATCH 13/20] fix testing --- .github/workflows/python-package.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index da3f924f..92d47fde 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,9 +3,7 @@ name: Python package -on: - push - pull_request +on: [push, pull_request] jobs: build: From c356d7432bad3d04b3433b036419ccc440e5ac65 Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 27 Apr 2022 15:22:58 +0200 Subject: [PATCH 14/20] Avoid double testing in PRs --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 92d47fde..cb1e04f9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,7 +3,7 @@ name: Python package -on: [push, pull_request] +on: push jobs: build: From 1654e8c2066c732a37e89dc690701d3817420c9d Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:28:56 +0200 Subject: [PATCH 15/20] Fix stats --- pairtools/lib/stats.py | 120 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 2b9ab373..5e0e128f 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -37,6 +37,86 @@ def do_merge(output, files_to_merge, **kwargs): outstream.close() +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + if nseq == 0: + warnings.warn("Empty of fully duplicated library, can't estimate complexity") + return 0 + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = nseq / seq_to_complexity + return complexity + + +def extract_tile_info(series, 
regex=False): + """Extract the name of the tile for each read name in the series + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. + By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + return split[2] + ":" + split[3] + ":" + split[4] + + +def analyse_duplicate_stats(dups, tile_dup_regex=False): + """Count by-tile duplicates + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + dups = dups.copy() + dups["tile"] = extract_tile_info(dups["readID"]) + dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) + dups["same_tile"] = dups["tile"] == dups["parent_tile"] + bytile_dups = ( + dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups + + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -180,6 +260,46 @@ def __iter__(self): def __len__(self): return len(self._stat) + def calculate_summaries(self): + """calculate summary statistics (fraction of cis pairs at different cutoffs, + complexity estimate) based on accumulated counts. 
Results are saved into + self._stat['summary'] + """ + for cis_count in ( + "cis", + "cis_1kb+", + "cis_2kb+", + "cis_4kb+", + "cis_10kb+", + "cis_20kb+", + "cis_40kb+", + ): + self._stat["summary"][f"frac_{cis_count}"] = ( + (self._stat[cis_count] / self._stat["total_nodups"]) + if self._stat["total_nodups"] > 0 + else 0 + ) + self._stat["summary"]["frac_dups"] = ( + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 + ) + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( + self._stat["total_mapped"], self._stat["total_dups"], 0 + ) + if self.bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] + ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file From 7cd0c649061fdf028303672111ebbaf4d1a932a4 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:33:09 +0200 Subject: [PATCH 16/20] Add missing pieces in stats --- pairtools/lib/stats.py | 52 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 5e0e128f..aed95fd5 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from collections.abc import Mapping import sys from . import fileio @@ -180,6 +181,26 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) "++": np.zeros(len(self._dist_bins), dtype=np.int), } + # Summaries are derived from other stats and are recalculated on merge + self._stat["summary"] = dict( + [ + ("frac_cis", 0), + ("frac_cis_1kb+", 0), + ("frac_cis_2kb+", 0), + ("frac_cis_4kb+", 0), + ("frac_cis_10kb+", 0), + ("frac_cis_20kb+", 0), + ("frac_cis_40kb+", 0), + ("frac_dups", 0), + ("complexity_naive", 0), + ] + ) + self.bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) + ) + def __getitem__(self, key): if isinstance(key, str): # let's strip any unintentional '/' @@ -465,9 +486,9 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. 
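        For example, a minimal chunk with the expected columns (hypothetical
        values; pd is pandas, imported at the top of this module):

            df = pd.DataFrame({
                "chrom1": ["chr1", "chr1"], "pos1": [100, 100],
                "chrom2": ["chr1", "chr2"], "pos2": [5000, 700],
                "strand1": ["+", "+"], "strand2": ["-", "+"],
                "pair_type": ["UU", "UU"],
            })
            PairCounter().add_pairs_from_dataframe(df)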
- + Parameters ---------- df: pd.DataFrame @@ -506,22 +527,25 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups_count = mask_dups.sum() + dups = df_mapped[mask_dups] + dups_count = dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) + df_nodups = df_mapped.loc[~mask_dups, :] + mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] + df_cis = df_nodups.loc[mask_cis, :].copy() + # Count pairs per chromosome: for (chrom1, chrom2), chrom_count in ( - df_mapped[["chrom1", "chrom2"]].value_counts().items() + df_nodups[["chrom1", "chrom2"]].value_counts().items() ): self._stat["chrom_freq"][(chrom1, chrom2)] = ( self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count ) # Count cis-trans by pairs: - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() + self._stat["cis"] += df_cis.shape[0] self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) @@ -538,6 +562,12 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) + ### Add by-tile dups + if bytile_dups and dups.shape[0] > 0: + self.bytile_dups = self.bytile_dups.add( + analyse_duplicate_stats(dups), fill_value=0 + ).astype(int) + def add_chromsizes(self, chromsizes): """ Add chromsizes field to the output stats @@ -728,3 +758,11 @@ def save(self, outstream, yaml=False): else: for k, v in self.flatten().items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + + def save_bytile_dups(self, outstream): + """save bytile duplication counts to a tab-delimited text file. + Parameters + ---------- + outstream: file handle + """ + self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) From 09cfab0ba0805a96dce31f6d4f09085a2c3d98b2 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 15:34:46 +0200 Subject: [PATCH 17/20] Fix missing imports --- pairtools/lib/stats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index aed95fd5..97badf09 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,5 +1,8 @@ +import warnings + import numpy as np import pandas as pd +from scipy import special from collections.abc import Mapping import sys from . import fileio From 6154e286648f271e2e928cecadb463a2ae6287cd Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 27 Apr 2022 16:47:51 +0200 Subject: [PATCH 18/20] FIx order in stats test --- pairtools/lib/stats.py | 13 ++++++++----- tests/data/mock.4stats.pairs | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 97badf09..653d2673 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -289,6 +289,13 @@ def calculate_summaries(self): complexity estimate) based on accumulated counts. 
Results are saved into self._stat['summary'] """ + + self._stat["summary"]["frac_dups"] = ( + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 + ) + for cis_count in ( "cis", "cis_1kb+", @@ -303,11 +310,7 @@ def calculate_summaries(self): if self._stat["total_nodups"] > 0 else 0 ) - self._stat["summary"]["frac_dups"] = ( - (self._stat["total_dups"] / self._stat["total_mapped"]) - if self._stat["total_mapped"] > 0 - else 0 - ) + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) diff --git a/tests/data/mock.4stats.pairs b/tests/data/mock.4stats.pairs index 4d6bd7fa..73c46673 100644 --- a/tests/data/mock.4stats.pairs +++ b/tests/data/mock.4stats.pairs @@ -10,11 +10,11 @@ #chromsize: chr3 100 #chromsize: chr1 100 #columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type -readid01 chr1 1 chr2 20 + + UU -readid02 chr1 1 chr1 50 + + UU -readid03 chr1 1 chr1 50 + + DD -readid04 chr1 1 chr1 2 + + UU -readid05 chr1 1 chr1 3 + + UR +readid01 chr1 1 chr1 50 + + UU +readid02 chr1 1 chr1 50 + + DD +readid03 chr1 1 chr1 2 + + UU +readid04 chr1 1 chr1 3 + + UR +readid05 chr1 1 chr2 20 + + UU readid06 chr2 1 chr3 2 + + UU readid07 ! 0 chr1 3 - + NU readid08 ! 0 chr1 3 - + MU From 7face1d3be43ea9b7a26588e4a86c20bab79340e Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Wed, 27 Apr 2022 14:09:40 -0400 Subject: [PATCH 19/20] important fixes; tests are back. --- pairtools/cli/dedup.py | 42 +++--- pairtools/cli/stats.py | 43 +++--- pairtools/lib/dedup.py | 3 +- pairtools/lib/stats.py | 311 +++++++++++++++++++++++------------------ 4 files changed, 224 insertions(+), 175 deletions(-) diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index 61d68572..74bfe29a 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -5,8 +5,8 @@ import ast import pathlib -# from distutils.log import warn -# import warnings +from .._logging import get_logger +logger = get_logger() from ..lib import fileio, pairsam_format, headerops from . import cli, common_io_options @@ -61,13 +61,14 @@ " By default, statistics are not printed.", ) @click.option( - "--output-bytile-dups-stats", + "--output-bytile-stats", type=str, default="", help="output file for duplicate statistics." " If file exists, it will be open in the append mode." " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." - " By default, by-tile duplicate statistics are not printed.", + " By default, by-tile duplicate statistics are not printed." + " Note that the readID should be provided and contain tile information for this option. 
", ) ### Set the dedup method: @click.option( @@ -231,7 +232,7 @@ def dedup( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -269,7 +270,7 @@ def dedup( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -303,7 +304,7 @@ def dedup_py( output_dups, output_unmapped, output_stats, - output_bytile_dups_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -361,18 +362,21 @@ def dedup_py( else None ) - out_bytile_dups_stats_stream = ( - fileio.auto_open( - output_bytile_dups_stats, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), + bytile_dups = False + if output_bytile_stats: + out_bytile_stats_stream = fileio.auto_open( + output_bytile_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), ) - if output_bytile_dups_stats - else None - ) + bytile_dups = True + if not keep_parent_id: + logger.warning("Force output --parent-readID because --output-bytile-stats provided.") + keep_parent_id = True + # generate empty PairCounter if stats output is requested: - out_stat = PairCounter() if output_stats else None + out_stat = PairCounter(bytile_dups=bytile_dups) if output_stats else None if not output_dups: outstream_dups = None @@ -486,8 +490,8 @@ def dedup_py( if out_stat: out_stat.save(out_stats_stream) - if output_bytile_dups_stats: - out_stat.save_bytile_dups(out_bytile_dups_stats_stream) + if bytile_dups: + out_stat.save_bytile_dups(out_bytile_stats_stream) if instream != sys.stdin: instream.close() diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index a8888687..ff7b5904 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -3,7 +3,6 @@ import io import sys import click -import warnings import pandas as pd from ..lib import fileio, pairsam_format, headerops @@ -11,6 +10,9 @@ from ..lib.stats import PairCounter, do_merge +from .._logging import get_logger +logger = get_logger() + UTIL_NAME = "pairtools_stats" @@ -45,18 +47,21 @@ default=False, help="If enabled, will analyse by-tile duplication statistics to estimate" " library complexity more accurately." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)" + " Saves by-tile stats into --output_bytile-stats stream, or regular output if --output_bytile-stats is not provided.", ) @click.option( - "--output-bytile-dups-stats", - type=str, + "--output-bytile-stats", default="", - help="If specified, will analyse by-tile duplication statistics to estimate" - " library complexity more accurately and will save details to this path." - " Requires parent_readID column to be saved by dedup (will be ignored otherwise)", + required=False, + help="output file for tile duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed." + " Note that the readID and parent_readID should be provided and contain tile information for this option.", ) @common_io_options -def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs): +def stats(input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs): """Calculate pairs statistics. 
INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -68,12 +73,12 @@ def stats(input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kw """ stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs, + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs, ) def stats_py( - input_path, output, merge, bytile_dups, output_bytile_dups_stats, **kwargs + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs ): if merge: do_merge(output, input_path, **kwargs) @@ -94,18 +99,23 @@ def stats_py( nproc=kwargs.get("nproc_out"), command=kwargs.get("cmd_out", None), ) + if bytile_dups and not output_bytile_stats: + output_bytile_stats = outstream + if output_bytile_stats: + bytile_dups = True header, body_stream = headerops.get_header(instream) cols = headerops.extract_column_names(header) - if (bytile_dups or output_bytile_dups_stats) and "parent_readID" not in cols: - warnings.warn( + # Check necessary columns for reporting bytile stats: + if bytile_dups and "parent_readID" not in cols: + logger.warning( "No 'parent_readID' column in the file, not generating duplicate stats." ) bytile_dups = False - output_bytile_dups_stats = False + # new stats class stuff would come here ... - stats = PairCounter() + stats = PairCounter(bytile_dups=bytile_dups) # Collecting statistics for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): @@ -115,8 +125,9 @@ def stats_py( chromsizes = headerops.extract_chromsizes(header) stats.add_chromsizes(chromsizes) - if output_bytile_dups_stats: - stats.save_bytile_dups() + if bytile_dups: + stats.save_bytile_dups(output_bytile_stats) + stats.calculate_summaries() # save statistics to file ... diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py index 29cccbbd..a7aac0a9 100644 --- a/pairtools/lib/dedup.py +++ b/pairtools/lib/dedup.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import warnings import scipy.spatial from scipy.sparse import coo_matrix @@ -380,7 +379,7 @@ def streaming_dedup_cython( # take care of empty lines not at the end of the file separately if rawline and (not stripline): - warnings.warn("Empty line detected not at the end of the file") + logger.warning("Empty line detected not at the end of the file") continue if stripline: diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 653d2673..9e822f43 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import pandas as pd from scipy import special @@ -7,118 +5,8 @@ import sys from . import fileio - -def do_merge(output, files_to_merge, **kwargs): - # Parse all stats files. - stats = [] - for stat_file in files_to_merge: - f = fileio.auto_open( - stat_file, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - # use a factory method to instanciate PairCounter - stat = PairCounter.from_file(f) - stats.append(stat) - f.close() - - # combine stats from several files (files_to_merge): - out_stat = sum(stats) - - # Save merged stats. - outstream = fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - # save statistics to file ... 
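# Note on the by-tile statistics wired up above (hypothetical read name): the
# regex-free path of extract_tile_info() keys duplicates by the flowcell, lane
# and tile fields (the third to fifth ":"-separated fields) of an Illumina
# read name, e.g.
#
#   A00123:55:HVK7YDSXX:2:11101:1234:5678  ->  tile "HVK7YDSXX:2:11101"
#
# so parent_readID must carry the original read names; SRA-style readIDs with
# these fields stripped cannot be used for by-tile duplicate statistics.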
- out_stat.save(outstream) - - if outstream != sys.stdout: - outstream.close() - - -def estimate_library_complexity(nseq, ndup, nopticaldup=0): - """Estimate library complexity accounting for optical/clustering duplicates - Parameters - ---------- - nseq : int - Total number of sequences - ndup : int - Total number of duplicates - nopticaldup : int, optional - Number of non-PCR duplicates, by default 0 - Returns - ------- - float - Estimated complexity - """ - nseq = nseq - nopticaldup - if nseq == 0: - warnings.warn("Empty of fully duplicated library, can't estimate complexity") - return 0 - ndup = ndup - nopticaldup - u = (nseq - ndup) / nseq - seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u - complexity = nseq / seq_to_complexity - return complexity - - -def extract_tile_info(series, regex=False): - """Extract the name of the tile for each read name in the series - Parameters - ---------- - series : pd.Series - Series containing read IDs - regex : bool, optional - Regex to extract fields from the read IDs that correspond to tile IDs. - By default False, uses a faster predefined approach for typical Illumina - read names - Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" - Returns - ------- - Series - Series containing tile IDs as strings - """ - if regex: - split = series.str.extractall(regex).unstack().droplevel(1, axis=1) - return split[0] + ":" + split[1] + ":" + split[2] - else: - split = series.str.split(":", expand=True) - return split[2] + ":" + split[3] + ":" + split[4] - - -def analyse_duplicate_stats(dups, tile_dup_regex=False): - """Count by-tile duplicates - Parameters - ---------- - dups : pd.DataFrame - Dataframe with duplicates that contains pared read IDs - tile_dup_regex : bool, optional - See extract_tile_info for details, by default False - Returns - ------- - pd.DataFrame - Grouped multi-indexed dataframe of pairwise by-tile duplication counts - """ - dups = dups.copy() - dups["tile"] = extract_tile_info(dups["readID"]) - dups["parent_tile"] = extract_tile_info(dups["parent_readID"]) - dups["same_tile"] = dups["tile"] == dups["parent_tile"] - bytile_dups = ( - dups.groupby(["tile", "parent_tile"]) - .size() - .reset_index(name="dup_count") - .sort_values(["tile", "parent_tile"]) - ) - bytile_dups[["tile", "parent_tile"]] = np.sort( - bytile_dups[["tile", "parent_tile"]].values, axis=1 - ) - bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() - return bytile_dups +from .._logging import get_logger +logger = get_logger() class PairCounter(Mapping): @@ -137,7 +25,7 @@ class PairCounter(Mapping): _SEP = "\t" _KEY_SEP = "/" - def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): + def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25, bytile_dups=False): self._stat = {} # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution @@ -198,11 +86,13 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) ("complexity_naive", 0), ] ) - self.bytile_dups = pd.DataFrame( - index=pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + self._save_bytile_dups = bytile_dups + if self._save_bytile_dups: + self._bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) ) - ) def __getitem__(self, key): if isinstance(key, str): @@ -314,18 +204,21 @@ def calculate_summaries(self): self._stat["summary"]["complexity_naive"] = 
estimate_library_complexity( self._stat["total_mapped"], self._stat["total_dups"], 0 ) - if self.bytile_dups.shape[0] > 0: - self._stat["dups_by_tile_median"] = ( - self.bytile_dups["dup_count"].median() * self.bytile_dups.shape[0] - ) - if "dups_by_tile_median" in self._stat: - self._stat["summary"][ - "complexity_dups_by_tile_median" - ] = estimate_library_complexity( - self._stat["total_mapped"], - self._stat["total_dups"], - self._stat["dups_by_tile_median"], - ) + + if self._save_bytile_dups: + # Estimate library complexity with information by tile, if provided: + if self._bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] + ) + if "dups_by_tile_median" in self._stat: + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) @classmethod def from_file(cls, file_handle): @@ -492,7 +385,7 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): else: self._stat["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): + def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. Parameters @@ -533,8 +426,8 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups = df_mapped[mask_dups] - dups_count = dups.shape[0] + df_dups = df_mapped[mask_dups] + dups_count = df_dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) @@ -569,9 +462,10 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!", bytile_dups=False): self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) ### Add by-tile dups - if bytile_dups and dups.shape[0] > 0: - self.bytile_dups = self.bytile_dups.add( - analyse_duplicate_stats(dups), fill_value=0 + if self._save_bytile_dups and (df_dups.shape[0] > 0): + bytile_dups = analyse_bytile_duplicate_stats(df_dups) + self._bytile_dups = self._bytile_dups.add( + bytile_dups, fill_value=0 ).astype(int) def add_chromsizes(self, chromsizes): @@ -681,6 +575,13 @@ def flatten(self): ) # store key,value pair: flat_stat[formatted_key] = freq + elif (k == "summary") and v: + for key, frac in v.items(): + formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( + k, key + ) + # store key,value pair: + flat_stat[formatted_key] = frac # return flattened dict return flat_stat @@ -732,6 +633,11 @@ def format(self): ] = freq # store key,value pair: formatted_stat[k] = deepcopy(freqs) + elif (k == "summary") and v: + summary_stats = {} + for key, frac in v.items(): + summary_stats[key] = frac + formatted_stat[k] = deepcopy(summary_stats) # return formatted dict return formatted_stat @@ -771,4 +677,133 @@ def save_bytile_dups(self, outstream): ---------- outstream: file handle """ - self.bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + if self._save_bytile_dups: + self._bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + else: + logger.error("Bytile dups are not calculated, cannot save.") + + +################## +# Other functions: + +def do_merge(output, files_to_merge, **kwargs): + # Parse all stats files. 
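    # Each input is the flat two-column layout written by save(): nested keys
    # joined with "/", one key/value pair per line, e.g. (hypothetical values):
    #
    #   total_nodups            850000
    #   pair_types/UU           840000
    #   summary/frac_dups       0.15
    #   chrom_freq/chr1/chr1    120000
    #   dist_freq/1000-1778/+-  3500
    #
    # PairCounter.from_file() below reverses this flattening.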
+ stats = [] + for stat_file in files_to_merge: + f = fileio.auto_open( + stat_file, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + # use a factory method to instanciate PairCounter + stat = PairCounter.from_file(f) + stats.append(stat) + f.close() + + # combine stats from several files (files_to_merge): + out_stat = sum(stats) + + # Save merged stats. + outstream = fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + # save statistics to file ... + out_stat.save(outstream) + + if outstream != sys.stdout: + outstream.close() + + +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + if nseq == 0: + logger.warning("Empty of fully duplicated library, can't estimate complexity") + return 0 + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + if u==0: + logger.warning("All the sequences are duplicates. Do you run complexity estimation on duplicates file?") + return 0 + seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u + complexity = float(nseq / seq_to_complexity) # clean np.int64 data type + return complexity + + +def analyse_bytile_duplicate_stats(df_dups, tile_dup_regex=False): + """Count by-tile duplicates + Parameters + ---------- + dups : pd.DataFrame + Dataframe with duplicates that contains pared read IDs + tile_dup_regex : bool, optional + See extract_tile_info for details, by default False + Returns + ------- + pd.DataFrame + Grouped multi-indexed dataframe of pairwise by-tile duplication counts + """ + + df_dups = df_dups.copy() + + df_dups["tile"] = extract_tile_info(df_dups["readID"], regex=tile_dup_regex) + df_dups["parent_tile"] = extract_tile_info(df_dups["parent_readID"], regex=tile_dup_regex) + + df_dups["same_tile"] = (df_dups["tile"] == df_dups["parent_tile"]) + bytile_dups = ( + df_dups.groupby(["tile", "parent_tile"]) + .size() + .reset_index(name="dup_count") + .sort_values(["tile", "parent_tile"]) + ) + bytile_dups[["tile", "parent_tile"]] = np.sort( + bytile_dups[["tile", "parent_tile"]].values, axis=1 + ) + bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum() + return bytile_dups + + +def extract_tile_info(series, regex=False): + """Extract the name of the tile for each read name in the series + Parameters + ---------- + series : pd.Series + Series containing read IDs + regex : bool, optional + Regex to extract fields from the read IDs that correspond to tile IDs. 
+ By default False, uses a faster predefined approach for typical Illumina + read names + Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)" + Returns + ------- + Series + Series containing tile IDs as strings + """ + if regex: + split = series.str.extractall(regex).unstack().droplevel(1, axis=1) + if split.shape[1]<4: + raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}") + return split[0] + ":" + split[1] + ":" + split[2] + else: + split = series.str.split(":", expand=True) + if split.shape[1]<5: + raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}") + return split[2] + ":" + split[3] + ":" + split[4] From c762fd5359453820a4fb89854e4d09c4731fc8eb Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Wed, 27 Apr 2022 14:43:45 -0400 Subject: [PATCH 20/20] compute_summaries is now part of saving the stats, if not done explicitly --- pairtools/cli/stats.py | 4 +--- pairtools/lib/stats.py | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index ff7b5904..890988b0 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -107,7 +107,7 @@ def stats_py( header, body_stream = headerops.get_header(instream) cols = headerops.extract_column_names(header) - # Check necessary columns for reporting bytile stats: + # Check necessary columns for reporting by-tile stats: if bytile_dups and "parent_readID" not in cols: logger.warning( "No 'parent_readID' column in the file, not generating duplicate stats." @@ -128,8 +128,6 @@ def stats_py( if bytile_dups: stats.save_bytile_dups(output_bytile_stats) - stats.calculate_summaries() - # save statistics to file ... stats.save(outstream, yaml=kwargs.get("yaml", False)) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 9e822f43..167ce491 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -93,6 +93,7 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25, levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] ) ) + self._summaries_calculated = False def __getitem__(self, key): if isinstance(key, str): @@ -211,7 +212,7 @@ def calculate_summaries(self): self._stat["dups_by_tile_median"] = ( self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] ) - if "dups_by_tile_median" in self._stat: + if "dups_by_tile_median" in self._stat.keys(): self._stat["summary"][ "complexity_dups_by_tile_median" ] = estimate_library_complexity( @@ -220,6 +221,8 @@ def calculate_summaries(self): self._stat["dups_by_tile_median"], ) + self._summaries_calculated = True + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file @@ -661,6 +664,9 @@ def save(self, outstream, yaml=False): sort(merge(A,merge(B,C))) == sort(merge(merge(A,B),C)) """ + if not self._summaries_calculated: + self.calculate_summaries() + # write flattened version of the PairCounter to outstream if yaml: import yaml
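For reference, a minimal end-to-end sketch of the reworked API after this series
(hypothetical file names; assumes a .pairs file whose header lists its columns
and, for the by-tile part, readID and parent_readID columns as written by
pairtools dedup --output-bytile-stats):

import sys
import pandas as pd

from pairtools.lib import fileio, headerops
from pairtools.lib.stats import PairCounter

instream = fileio.auto_open("sample.dedup.pairs.gz", mode="r", nproc=1)
header, body = headerops.get_header(instream)
cols = headerops.extract_column_names(header)

# Count by-tile duplicates only if dedup kept the parent read IDs.
counter = PairCounter(bytile_dups="parent_readID" in cols)
for chunk in pd.read_table(body, names=cols, chunksize=100_000):
    counter.add_pairs_from_dataframe(chunk)

counter.save(sys.stdout)  # summaries are now filled in by save() itself
if "parent_readID" in cols:
    with open("sample.bytile.tsv", "w") as out:
        counter.save_bytile_dups(out)
instream.close()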