diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 54fffe4b..cb1e04f9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,11 +3,7 @@ name: Python package -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: push jobs: build: diff --git a/pairtools/__init__.py b/pairtools/__init__.py index c534813d..e4aa6788 100644 --- a/pairtools/__init__.py +++ b/pairtools/__init__.py @@ -12,4 +12,4 @@ __version__ = "1.0.0-dev1" -# from . import lib \ No newline at end of file +# from . import lib diff --git a/pairtools/cli/__init__.py b/pairtools/cli/__init__.py index a1d85d81..9f42e44e 100644 --- a/pairtools/cli/__init__.py +++ b/pairtools/cli/__init__.py @@ -142,6 +142,7 @@ def _excepthook(exc_type, value, tb): sys.excepthook = _excepthook + def common_io_options(func): @click.option( "--nproc-in", @@ -200,5 +201,5 @@ def wrapper(*args, **kwargs): sample, filterbycov, header, - scaling + scaling, ) diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index 55a3348c..74bfe29a 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -5,8 +5,8 @@ import ast import pathlib -# from distutils.log import warn -# import warnings +from .._logging import get_logger +logger = get_logger() from ..lib import fileio, pairsam_format, headerops from . import cli, common_io_options @@ -60,7 +60,16 @@ " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." " By default, statistics are not printed.", ) - +@click.option( + "--output-bytile-stats", + type=str, + default="", + help="output file for duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed." + " Note that the readID should be provided and contain tile information for this option. 
", +) ### Set the dedup method: @click.option( "--max-mismatch", @@ -223,6 +232,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -260,6 +270,7 @@ def dedup( output_dups, output_unmapped, output_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -293,6 +304,7 @@ def dedup_py( output_dups, output_unmapped, output_stats, + output_bytile_stats, chunksize, carryover, max_mismatch, @@ -350,8 +362,21 @@ def dedup_py( else None ) + bytile_dups = False + if output_bytile_stats: + out_bytile_stats_stream = fileio.auto_open( + output_bytile_stats, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + bytile_dups = True + if not keep_parent_id: + logger.warning("Force output --parent-readID because --output-bytile-stats provided.") + keep_parent_id = True + # generate empty PairCounter if stats output is requested: - out_stat = PairCounter() if output_stats else None + out_stat = PairCounter(bytile_dups=bytile_dups) if output_stats else None if not output_dups: outstream_dups = None @@ -465,6 +490,9 @@ def dedup_py( if out_stat: out_stat.save(out_stats_stream) + if bytile_dups: + out_stat.save_bytile_dups(out_bytile_stats_stream) + if instream != sys.stdin: instream.close() diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index f1162805..890988b0 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -10,6 +10,8 @@ from ..lib.stats import PairCounter, do_merge +from .._logging import get_logger +logger = get_logger() UTIL_NAME = "pairtools_stats" @@ -40,8 +42,26 @@ default=False, help="Output stats in yaml format instead of table. ", ) +@click.option( + "--bytile-dups/--no-bytile-dups", + default=False, + help="If enabled, will analyse by-tile duplication statistics to estimate" + " library complexity more accurately." + " Requires parent_readID column to be saved by dedup (will be ignored otherwise)" + " Saves by-tile stats into --output_bytile-stats stream, or regular output if --output_bytile-stats is not provided.", +) +@click.option( + "--output-bytile-stats", + default="", + required=False, + help="output file for tile duplicate statistics." + " If file exists, it will be open in the append mode." + " If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed." + " By default, by-tile duplicate statistics are not printed." + " Note that the readID and parent_readID should be provided and contain tile information for this option.", +) @common_io_options -def stats(input_path, output, merge, **kwargs): +def stats(input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs): """Calculate pairs statistics. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -51,10 +71,15 @@ def stats(input_path, output, merge, **kwargs): The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c. 
""" - stats_py(input_path, output, merge, **kwargs) + + stats_py( + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs, + ) -def stats_py(input_path, output, merge, **kwargs): +def stats_py( + input_path, output, merge, bytile_dups, output_bytile_stats, **kwargs +): if merge: do_merge(output, input_path, **kwargs) return @@ -74,14 +99,25 @@ def stats_py(input_path, output, merge, **kwargs): nproc=kwargs.get("nproc_out"), command=kwargs.get("cmd_out", None), ) + if bytile_dups and not output_bytile_stats: + output_bytile_stats = outstream + if output_bytile_stats: + bytile_dups = True header, body_stream = headerops.get_header(instream) cols = headerops.extract_column_names(header) + # Check necessary columns for reporting by-tile stats: + if bytile_dups and "parent_readID" not in cols: + logger.warning( + "No 'parent_readID' column in the file, not generating duplicate stats." + ) + bytile_dups = False + # new stats class stuff would come here ... - stats = PairCounter() + stats = PairCounter(bytile_dups=bytile_dups) - # collecting statistics + # Collecting statistics for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000): stats.add_pairs_from_dataframe(chunk) @@ -89,6 +125,9 @@ def stats_py(input_path, output, merge, **kwargs): chromsizes = headerops.extract_chromsizes(header) stats.add_chromsizes(chromsizes) + if bytile_dups: + stats.save_bytile_dups(output_bytile_stats) + # save statistics to file ... stats.save(outstream, yaml=kwargs.get("yaml", False)) diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py index 29cccbbd..a7aac0a9 100644 --- a/pairtools/lib/dedup.py +++ b/pairtools/lib/dedup.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import warnings import scipy.spatial from scipy.sparse import coo_matrix @@ -380,7 +379,7 @@ def streaming_dedup_cython( # take care of empty lines not at the end of the file separately if rawline and (not stripline): - warnings.warn("Empty line detected not at the end of the file") + logger.warning("Empty line detected not at the end of the file") continue if stripline: diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index c42370c9..167ce491 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -1,39 +1,12 @@ import numpy as np +import pandas as pd +from scipy import special from collections.abc import Mapping import sys from . import fileio -def do_merge(output, files_to_merge, **kwargs): - # Parse all stats files. - stats = [] - for stat_file in files_to_merge: - f = fileio.auto_open( - stat_file, - mode="r", - nproc=kwargs.get("nproc_in"), - command=kwargs.get("cmd_in", None), - ) - # use a factory method to instanciate PairCounter - stat = PairCounter.from_file(f) - stats.append(stat) - f.close() - - # combine stats from several files (files_to_merge): - out_stat = sum(stats) - - # Save merged stats. - outstream = fileio.auto_open( - output, - mode="w", - nproc=kwargs.get("nproc_out"), - command=kwargs.get("cmd_out", None), - ) - - # save statistics to file ... 
- out_stat.save(outstream) - - if outstream != sys.stdout: - outstream.close() +from .._logging import get_logger +logger = get_logger() class PairCounter(Mapping): @@ -52,7 +25,7 @@ class PairCounter(Mapping): _SEP = "\t" _KEY_SEP = "/" - def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25): + def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25, bytile_dups=False): self._stat = {} # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution @@ -99,6 +72,29 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25) "++": np.zeros(len(self._dist_bins), dtype=np.int), } + # Summaries are derived from other stats and are recalculated on merge + self._stat["summary"] = dict( + [ + ("frac_cis", 0), + ("frac_cis_1kb+", 0), + ("frac_cis_2kb+", 0), + ("frac_cis_4kb+", 0), + ("frac_cis_10kb+", 0), + ("frac_cis_20kb+", 0), + ("frac_cis_40kb+", 0), + ("frac_dups", 0), + ("complexity_naive", 0), + ] + ) + self._save_bytile_dups = bytile_dups + if self._save_bytile_dups: + self._bytile_dups = pd.DataFrame( + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["tile", "parent_tile"] + ) + ) + self._summaries_calculated = False + def __getitem__(self, key): if isinstance(key, str): # let's strip any unintentional '/' @@ -179,6 +175,54 @@ def __iter__(self): def __len__(self): return len(self._stat) + def calculate_summaries(self): + """calculate summary statistics (fraction of cis pairs at different cutoffs, + complexity estimate) based on accumulated counts. Results are saved into + self._stat['summary'] + """ + + self._stat["summary"]["frac_dups"] = ( + (self._stat["total_dups"] / self._stat["total_mapped"]) + if self._stat["total_mapped"] > 0 + else 0 + ) + + for cis_count in ( + "cis", + "cis_1kb+", + "cis_2kb+", + "cis_4kb+", + "cis_10kb+", + "cis_20kb+", + "cis_40kb+", + ): + self._stat["summary"][f"frac_{cis_count}"] = ( + (self._stat[cis_count] / self._stat["total_nodups"]) + if self._stat["total_nodups"] > 0 + else 0 + ) + + self._stat["summary"]["complexity_naive"] = estimate_library_complexity( + self._stat["total_mapped"], self._stat["total_dups"], 0 + ) + + if self._save_bytile_dups: + # Estimate library complexity with information by tile, if provided: + if self._bytile_dups.shape[0] > 0: + self._stat["dups_by_tile_median"] = ( + self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] + ) + if "dups_by_tile_median" in self._stat.keys(): + self._stat["summary"][ + "complexity_dups_by_tile_median" + ] = estimate_library_complexity( + self._stat["total_mapped"], + self._stat["total_dups"], + self._stat["dups_by_tile_median"], + ) + + self._summaries_calculated = True + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file @@ -346,7 +390,7 @@ def add_pair(self, chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type): def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. 
- + Parameters ---------- df: pd.DataFrame @@ -385,22 +429,25 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): mask_dups = df_mapped["duplicate"] else: mask_dups = df_mapped["pair_type"] == "DD" - dups_count = mask_dups.sum() + df_dups = df_mapped[mask_dups] + dups_count = df_dups.shape[0] self._stat["total_dups"] += int(dups_count) self._stat["total_nodups"] += int(mapped_count - dups_count) + df_nodups = df_mapped.loc[~mask_dups, :] + mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] + df_cis = df_nodups.loc[mask_cis, :].copy() + # Count pairs per chromosome: for (chrom1, chrom2), chrom_count in ( - df_mapped[["chrom1", "chrom2"]].value_counts().items() + df_nodups[["chrom1", "chrom2"]].value_counts().items() ): self._stat["chrom_freq"][(chrom1, chrom2)] = ( self._stat["chrom_freq"].get((chrom1, chrom2), 0) + chrom_count ) # Count cis-trans by pairs: - df_nodups = df_mapped.loc[~mask_dups, :] - mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"] - df_cis = df_nodups.loc[mask_cis, :].copy() + self._stat["cis"] += df_cis.shape[0] self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0] dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) @@ -417,6 +464,13 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat["cis_20kb+"] += int(np.sum(dist >= 20000)) self._stat["cis_40kb+"] += int(np.sum(dist >= 40000)) + ### Add by-tile dups + if self._save_bytile_dups and (df_dups.shape[0] > 0): + bytile_dups = analyse_bytile_duplicate_stats(df_dups) + self._bytile_dups = self._bytile_dups.add( + bytile_dups, fill_value=0 + ).astype(int) + def add_chromsizes(self, chromsizes): """ Add chromsizes field to the output stats @@ -524,6 +578,13 @@ def flatten(self): ) # store key,value pair: flat_stat[formatted_key] = freq + elif (k == "summary") and v: + for key, frac in v.items(): + formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( + k, key + ) + # store key,value pair: + flat_stat[formatted_key] = frac # return flattened dict return flat_stat @@ -551,7 +612,9 @@ def format(self): for dirs, freqs in v.items(): # last bin is treated differently: "100000+" vs "1200-3000": if i != len(self._dist_bins) - 1: - dist = "{}-{}".format(self._dist_bins[i], self._dist_bins[i + 1]) + dist = "{}-{}".format( + self._dist_bins[i], self._dist_bins[i + 1] + ) else: dist = "{}+".format(self._dist_bins[i]) if dist not in freqs_dct.keys(): @@ -568,14 +631,20 @@ def format(self): elif (k == "chrom_freq") and v: freqs = {} for (chrom1, chrom2), freq in v.items(): - freqs[self._KEY_SEP.join(["{}", "{}"]).format(chrom1, chrom2)] = freq + freqs[ + self._KEY_SEP.join(["{}", "{}"]).format(chrom1, chrom2) + ] = freq # store key,value pair: formatted_stat[k] = deepcopy(freqs) + elif (k == "summary") and v: + summary_stats = {} + for key, frac in v.items(): + summary_stats[key] = frac + formatted_stat[k] = deepcopy(summary_stats) # return formatted dict return formatted_stat - def save(self, outstream, yaml=False): """save PairCounter to tab-delimited text file. Flattened version of PairCounter is stored in the file. 
@@ -595,11 +664,152 @@ def save(self, outstream, yaml=False): sort(merge(A,merge(B,C))) == sort(merge(merge(A,B),C)) """ + if not self._summaries_calculated: + self.calculate_summaries() + # write flattened version of the PairCounter to outstream if yaml: import yaml + data = self.format() yaml.dump(data, outstream, default_flow_style=False) else: for k, v in self.flatten().items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + + def save_bytile_dups(self, outstream): + """save bytile duplication counts to a tab-delimited text file. + Parameters + ---------- + outstream: file handle + """ + if self._save_bytile_dups: + self._bytile_dups.reset_index().to_csv(outstream, sep="\t", index=False) + else: + logger.error("Bytile dups are not calculated, cannot save.") + + +################## +# Other functions: + +def do_merge(output, files_to_merge, **kwargs): + # Parse all stats files. + stats = [] + for stat_file in files_to_merge: + f = fileio.auto_open( + stat_file, + mode="r", + nproc=kwargs.get("nproc_in"), + command=kwargs.get("cmd_in", None), + ) + # use a factory method to instanciate PairCounter + stat = PairCounter.from_file(f) + stats.append(stat) + f.close() + + # combine stats from several files (files_to_merge): + out_stat = sum(stats) + + # Save merged stats. + outstream = fileio.auto_open( + output, + mode="w", + nproc=kwargs.get("nproc_out"), + command=kwargs.get("cmd_out", None), + ) + + # save statistics to file ... + out_stat.save(outstream) + + if outstream != sys.stdout: + outstream.close() + + +def estimate_library_complexity(nseq, ndup, nopticaldup=0): + """Estimate library complexity accounting for optical/clustering duplicates + Parameters + ---------- + nseq : int + Total number of sequences + ndup : int + Total number of duplicates + nopticaldup : int, optional + Number of non-PCR duplicates, by default 0 + Returns + ------- + float + Estimated complexity + """ + nseq = nseq - nopticaldup + if nseq == 0: + logger.warning("Empty of fully duplicated library, can't estimate complexity") + return 0 + ndup = ndup - nopticaldup + u = (nseq - ndup) / nseq + if u==0: + logger.warning("All the sequences are duplicates. 
Did you run complexity estimation on a duplicates file?")
+        return 0
+    seq_to_complexity = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u
+    complexity = float(nseq / seq_to_complexity)  # clean np.int64 data type
+    return complexity
+
+
+def analyse_bytile_duplicate_stats(df_dups, tile_dup_regex=False):
+    """Count by-tile duplicates
+    Parameters
+    ----------
+    df_dups : pd.DataFrame
+        Dataframe with duplicates that contains paired read IDs
+    tile_dup_regex : bool, optional
+        See extract_tile_info for details, by default False
+    Returns
+    -------
+    pd.DataFrame
+        Grouped multi-indexed dataframe of pairwise by-tile duplication counts
+    """
+
+    df_dups = df_dups.copy()
+
+    df_dups["tile"] = extract_tile_info(df_dups["readID"], regex=tile_dup_regex)
+    df_dups["parent_tile"] = extract_tile_info(df_dups["parent_readID"], regex=tile_dup_regex)
+
+    df_dups["same_tile"] = (df_dups["tile"] == df_dups["parent_tile"])
+    bytile_dups = (
+        df_dups.groupby(["tile", "parent_tile"])
+        .size()
+        .reset_index(name="dup_count")
+        .sort_values(["tile", "parent_tile"])
+    )
+    bytile_dups[["tile", "parent_tile"]] = np.sort(
+        bytile_dups[["tile", "parent_tile"]].values, axis=1
+    )
+    bytile_dups = bytile_dups.groupby(["tile", "parent_tile"]).sum()
+    return bytile_dups
+
+
+def extract_tile_info(series, regex=False):
+    """Extract the name of the tile for each read name in the series
+    Parameters
+    ----------
+    series : pd.Series
+        Series containing read IDs
+    regex : str or bool, optional
+        Regex to extract fields from the read IDs that correspond to tile IDs.
+        By default False, uses a faster predefined approach for typical Illumina
+        read names
+        Example: r"(?:\w+):(?:\w+):(\w+):(\w+):(\w+):(?:\w+):(?:\w+)"
+    Returns
+    -------
+    Series
+        Series containing tile IDs as strings
+    """
+    if regex:
+        split = series.str.extractall(regex).unstack().droplevel(1, axis=1)
+        if split.shape[1] < 4:
+            raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}")
+        return split[0] + ":" + split[1] + ":" + split[2]
+    else:
+        split = series.str.split(":", expand=True)
+        if split.shape[1] < 5:
+            raise ValueError(f"Unable to convert tile names, does your readID have the tile information?\nHint: SRA removes tile information from readID.\nSample of your readIDs:\n{series.head()}")
+        return split[2] + ":" + split[3] + ":" + split[4]
diff --git a/tests/data/mock.4stats.pairs b/tests/data/mock.4stats.pairs
new file mode 100644
index 00000000..73c46673
--- /dev/null
+++ b/tests/data/mock.4stats.pairs
@@ -0,0 +1,21 @@
+## pairs format v1.0.0
+#shape: upper triangle
+#genome_assembly: unknown
+#samheader: @SQ	SN:chr1	LN:100
+#samheader: @SQ	SN:chr2	LN:100
+#samheader: @SQ	SN:chr3	LN:100
+#samheader: @PG	ID:bwa	PN:bwa	VN:0.7.15-r1140	CL:bwa mem -SP /path/ucsc.hg19.fasta.gz /path/1.fastq.gz /path/2.fastq.gz
+#chromosomes: chr2 chr3 chr1
+#chromsize: chr2 100
+#chromsize: chr3 100
+#chromsize: chr1 100
+#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type
+readid01	chr1	1	chr1	50	+	+	UU
+readid02	chr1	1	chr1	50	+	+	DD
+readid03	chr1	1	chr1	2	+	+	UU
+readid04	chr1	1	chr1	3	+	+	UR
+readid05	chr1	1	chr2	20	+	+	UU
+readid06	chr2	1	chr3	2	+	+	UU
+readid07	!	0	chr1	3	-	+	NU
+readid08	!	0	chr1	3	-	+	MU
+readid09	!	0	!	0	-	-	WW
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 344ef56f..d3d6c985 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -3,12 +3,13 @@
 import sys
 import subprocess
 from nose.tools import assert_raises
+import numpy as np
 
 testdir = os.path.dirname(os.path.realpath(__file__))
 
 
 def test_mock_pairsam():
-    mock_pairsam_path = os.path.join(testdir, "data", "mock.pairsam")
+    mock_pairsam_path = os.path.join(testdir, "data", "mock.4stats.pairs")
     try:
         result = subprocess.check_output(
             ["python", "-m", "pairtools", "stats", mock_pairsam_path],
@@ -25,12 +26,16 @@
         )
 
     for k in stats:
-        stats[k] = int(stats[k])
+        try:
+            stats[k] = int(stats[k])
+        except ValueError:
+            stats[k] = float(stats[k])
 
     print(stats)
-    assert stats["total"] == 8
+    assert stats["total"] == 9
     assert stats["total_single_sided_mapped"] == 2
-    assert stats["total_mapped"] == 5
+    assert stats["total_mapped"] == 6
+    assert stats["total_dups"] == 1
     assert stats["cis"] == 3
     assert stats["trans"] == 2
     assert stats["pair_types/UU"] == 4
@@ -38,6 +43,7 @@
     assert stats["pair_types/WW"] == 1
     assert stats["pair_types/UR"] == 1
     assert stats["pair_types/MU"] == 1
+    assert stats["pair_types/DD"] == 1
     assert stats["chrom_freq/chr1/chr2"] == 1
     assert stats["chrom_freq/chr1/chr1"] == 3
     assert stats["chrom_freq/chr2/chr3"] == 1
@@ -47,7 +53,17 @@
         if k.startswith("dist_freq")
         and k not in ["dist_freq/1-2/++", "dist_freq/2-3/++", "dist_freq/32-56/++"]
     )
-
+    assert stats["dist_freq/1-2/++"] == 1
+    assert stats["dist_freq/2-3/++"] == 1
+    assert stats["dist_freq/32-56/++"] == 1
+    assert stats["summary/frac_cis"] == 0.6
+    assert stats["summary/frac_cis_1kb+"] == 0
+    assert stats["summary/frac_cis_2kb+"] == 0
+    assert stats["summary/frac_cis_4kb+"] == 0
+    assert stats["summary/frac_cis_10kb+"] == 0
+    assert stats["summary/frac_cis_20kb+"] == 0
+    assert stats["summary/frac_cis_40kb+"] == 0
+    assert np.isclose(stats["summary/frac_dups"], 1 / 6)
     assert stats["dist_freq/1-2/++"] == 1
     assert stats["dist_freq/2-3/++"] == 1
     assert stats["dist_freq/32-56/++"] == 1
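
Note (not part of the diff above): a minimal, self-contained sketch of the saturation model that the new estimate_library_complexity() inverts with scipy's Lambert W function, checked against the counts in the new mock.4stats.pairs fixture (6 mapped pairs, 1 duplicate). The helper name complexity_from_counts is made up for illustration; only the math mirrors the diff.

import numpy as np
from scipy import special

def complexity_from_counts(total_mapped, total_dups):
    # Library-size model: unique = C * (1 - exp(-total / C)), where C is complexity.
    u = (total_mapped - total_dups) / total_mapped  # fraction of unique pairs
    # Solve u * x = 1 - exp(-x) for x = total / C, as estimate_library_complexity()
    # does (with the optical-duplicate correction set to 0).
    x = special.lambertw(-np.exp(-1 / u) / u).real + 1 / u
    return total_mapped / x

C = complexity_from_counts(6, 1)
# Sanity check: the estimate reproduces the observed number of unique pairs.
assert np.isclose(6 - 1, C * (1 - np.exp(-6 / C)))
print(round(C, 2))  # roughly 16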
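
Also outside the diff: a quick illustration of what the fast, non-regex branch of the new extract_tile_info() returns for a typical colon-separated Illumina read name (the read name below is invented).

import pandas as pd

# instrument:run:flowcell:lane:tile:x:y
read_ids = pd.Series(["A00123:456:HXYZABCXX:1:1101:10000:20000"])
split = read_ids.str.split(":", expand=True)
# Fields 2-4 (flowcell, lane, tile) identify the tile, as in extract_tile_info():
print(split[2] + ":" + split[3] + ":" + split[4])  # -> HXYZABCXX:1:1101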