Add summaries #105

Merged · 23 commits · Apr 27, 2022
Changes from 4 commits
pairtools/pairtools_stats.py (106 changes: 81 additions & 25 deletions)

@@ -27,14 +27,14 @@
)
@common_io_options
def stats(input_path, output, merge, **kwargs):
"""Calculate pairs statistics.
"""Calculate pairs statistics.

INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
If not provided, the input is read from stdin.
If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number
of stats files to merge.

The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c.
"""
stats_py(input_path, output, merge, **kwargs)

@@ -72,9 +72,22 @@ def stats_py(input_path, output, merge, **kwargs):
stats = PairCounter()

# Collecting statistics

- for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000):
-     stats.add_pairs_from_dataframe(chunk)
+ for line in body_stream:
+     cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
+
+     # algn1:
+     chrom1 = cols[_pairsam_format.COL_C1]
+     pos1 = int(cols[_pairsam_format.COL_P1])
+     strand1 = cols[_pairsam_format.COL_S1]
+     # algn2:
+     chrom2 = cols[_pairsam_format.COL_C2]
+     pos2 = int(cols[_pairsam_format.COL_P2])
+     strand2 = cols[_pairsam_format.COL_S2]
+     # pair type:
+     pair_type = cols[_pairsam_format.COL_PTYPE]
+
+     stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type)
+ stats.calculate_summaries()

# save statistics to file ...
stats.save(outstream)
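For context, a minimal standalone sketch of the per-line parsing added above. The column order follows the .pairs convention (readID, chrom1, pos1, chrom2, pos2, strand1, strand2, pair_type); the inlined constants and the sample line are invented stand-ins for what _pairsam_format actually provides:

    # Hypothetical stand-ins for the _pairsam_format constants; the sample line is invented.
    PAIRSAM_SEP = "\t"
    COL_C1, COL_P1, COL_C2, COL_P2, COL_S1, COL_S2, COL_PTYPE = 1, 2, 3, 4, 5, 6, 7

    line = "read001\tchr1\t100\tchr1\t5100\t+\t-\tUU\n"
    cols = line.rstrip().split(PAIRSAM_SEP)
    chrom1, pos1, strand1 = cols[COL_C1], int(cols[COL_P1]), cols[COL_S1]
    chrom2, pos2, strand2 = cols[COL_C2], int(cols[COL_P2]), cols[COL_S2]
    assert (chrom1, pos1, strand1, cols[COL_PTYPE]) == ("chr1", 100, "+", "UU")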
@@ -118,6 +131,13 @@ def do_merge(output, files_to_merge, **kwargs):
outstream.close()


def complexity_estimate(total_mapped, total_dups):
try:
return total_mapped ** 2 / total_dups / 2
except ZeroDivisionError:
return 0
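
As an aside on the formula above: if N mapped pairs are sampled from a library of C distinct molecules, the expected number of duplicate pairs for N much smaller than C is roughly N^2 / (2C), so inverting gives the estimate C ≈ N^2 / (2 · dups) that complexity_estimate returns. A quick check with invented counts:

    # Invented counts, for illustration only.
    total_mapped, total_dups = 1_000_000, 5_000
    assert total_mapped ** 2 / total_dups / 2 == 100_000_000.0  # ~1e8 unique molecules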


class PairCounter(Mapping):
"""
A Counter for Hi-C pairs that accumulates various statistics.
@@ -180,6 +200,19 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25):
"--": np.zeros(len(self._dist_bins), dtype=np.int),
"++": np.zeros(len(self._dist_bins), dtype=np.int),
}
# Summaries are derived from other stats and are recalculated on merge
self._stat["summary"] = dict(
[
("frac_cis", 0),
("frac_cis_1kb+", 0),
("frac_cis_2kb+", 0),
("frac_cis_4kb+", 0),
("frac_cis_10kb+", 0),
("frac_cis_20kb+", 0),
("frac_cis_40kb+", 0),
("complexity", 0),
]
)

def __getitem__(self, key):
if isinstance(key, str):
@@ -195,7 +228,7 @@ def __getitem__(self, key):
return self._stat[key]
else:
# clearly an error:
raise ValueError("{} is not a valid key: must be str".format(key))
raise ValueError(f"{key} is not a valid key: must be str")

# K_FIELDS:
# process multi-key case:
@@ -209,9 +242,7 @@ def __getitem__(self, key):
return self._stat[k][k_fields[0]]
else:
raise ValueError(
"{} is not a valid key: {} section implies 1 identifier".format(
key, k
)
f"{key} is not a valid key: {k} section implies 1 identifier"
)

elif k == "chrom_freq":
@@ -220,9 +251,7 @@ def __getitem__(self, key):
return self._stat[k][tuple(k_fields)]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
key, k
)
f"{key} is not a valid key: {k} section implies 2 identifiers"
)

elif k == "dist_freq":
@@ -248,19 +277,39 @@ def __getitem__(self, key):
return self._stat["dist_freq"][dirs][bin_idx]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
key, k
)
f"{key} is not a valid key: {k} section implies 2 identifiers"
)
else:
raise ValueError("{} is not a valid key".format(k))
raise ValueError(f"{k} is not a valid key")

def __iter__(self):
return iter(self._stat)

def __len__(self):
return len(self._stat)

def calculate_summaries(self):
"""calculate summary statistics (fraction of cis pairs at different cutoffs,
complexity estimate) based on accumulated counts. Results are saved into
self._stat['summary']

"""
for cis_count in (
"cis",
"cis_1kb+",
"cis_2kb+",
"cis_4kb+",
"cis_10kb+",
"cis_20kb+",
"cis_40kb+",
):
self._stat["summary"][f"frac_{cis_count}"] = (
self._stat[cis_count] / self._stat["total_nodups"]
)
self._stat["summary"]["complexity"] = complexity_estimate(
self._stat["total_mapped"], self._stat["total_dups"]
)
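
A hypothetical usage sketch of this method (all counts invented; _stat is written to directly only for illustration):

    pc = PairCounter()
    pc._stat["cis"], pc._stat["total_nodups"] = 60, 100
    for key in ("cis_1kb+", "cis_2kb+", "cis_4kb+", "cis_10kb+", "cis_20kb+", "cis_40kb+"):
        pc._stat[key] = 0
    pc._stat["total_mapped"], pc._stat["total_dups"] = 120, 20
    pc.calculate_summaries()
    assert pc._stat["summary"]["frac_cis"] == 0.6
    assert pc._stat["summary"]["complexity"] == 120 ** 2 / 20 / 2  # 360.0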

@classmethod
def from_file(cls, file_handle):
"""create instance of PairCounter from file
@@ -301,14 +350,17 @@ def from_file(cls, file_handle):
)
)
else:
- # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup']
+ # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup','summary']
# get the first 'key' and keep the remainders in 'key_fields'
key = key_fields.pop(0)
if key in ["pair_types", "dedup"]:
if key in ["pair_types", "dedup", "summary"]:
# assert there is only one element in key_fields left:
# 'pair_types' and 'dedup' treated the same
if len(key_fields) == 1:
- stat_from_file._stat[key][key_fields[0]] = int(fields[1])
+ try:
+     stat_from_file._stat[key][key_fields[0]] = int(fields[1])
+ except ValueError:
+     stat_from_file._stat[key][key_fields[0]] = float(fields[1])
else:
raise _fileio.ParseError(
"{} is not a valid stats file: {} section implies 1 identifier".format(
@@ -452,7 +504,9 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
self._stat["total_unmapped"] += int(unmapped_count)

# Count the mapped:
- df_mapped = df.loc[(df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :]
+ df_mapped = df.loc[
+     (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :
+ ]
mapped_count = df_mapped.shape[0]

self._stat["total_mapped"] += mapped_count
@@ -539,6 +593,7 @@ def __add__(self, other):
if k == "dist_freq":
for dirs in sum_stat[k]:
sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs]
sum_stat.calculate_summaries()
return sum_stat
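
Recalculating the summaries here matters because fractions do not merge additively; only the underlying counts do. A made-up example:

    # Two invented stats files with cis/total_nodups of 5/10 and 30/30.
    naive_mean = (5 / 10 + 30 / 30) / 2   # 0.75, wrong for the merged data
    recomputed = (5 + 30) / (10 + 30)     # 0.875, what summing counts first gives
    assert naive_mean != recomputed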

# we need this to be able to sum(list_of_PairCounters)
@@ -549,6 +604,7 @@ def __radd__(self, other):
return self.__add__(other)

def flatten(self):

"""return a flattened dict (formatted same way as .stats file)

"""
@@ -578,8 +634,9 @@ def flatten(self):
).format(k, self._dist_bins[i], dirs)
# store key,value pair:
flat_stat[formatted_key] = freqs[i]
elif (k in ["pair_types", "dedup"]) and v:
# 'pair_types' and 'dedup' are simple dicts inside,
elif (k in ["pair_types", "dedup", "summary"]) and v:
# 'pair_types', 'dedup' and 'summary' are simple dicts inside,

# treat them the exact same way:
for k_item, freq in v.items():
formatted_key = self._KEY_SEP.join(["{}", "{}"]).format(
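For the new section, the flattened keys produced here come out as (values invented; the key format matches the test assertions below):

    summary/frac_cis	0.6
    summary/complexity	360.0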
@@ -624,4 +681,3 @@ def save(self, outstream):

if __name__ == "__main__":
stats()

tests/test_stats.py (12 changes: 11 additions & 1 deletion)
@@ -27,7 +27,10 @@ def test_mock_pairsam():
if not l.startswith('#') and l.strip())

for k in stats:
- stats[k] = int(stats[k])
+ try:
+     stats[k] = int(stats[k])
+ except ValueError:
+     stats[k] = float(stats[k])
print(stats)

assert stats['total'] == 8
@@ -53,4 +56,11 @@ def test_mock_pairsam():
assert stats['dist_freq/1-2/++'] == 1
assert stats['dist_freq/2-3/++'] == 1
assert stats['dist_freq/32-56/++'] == 1
assert stats['summary/frac_cis'] == 0.6
assert stats['summary/frac_cis_1kb+'] == 0
assert stats['summary/frac_cis_2kb+'] == 0
assert stats['summary/frac_cis_4kb+'] == 0
assert stats['summary/frac_cis_10kb+'] == 0
assert stats['summary/frac_cis_20kb+'] == 0
assert stats['summary/frac_cis_40kb+'] == 0
Member:
Wow, these are great tests! Is it possible also to test the deduplication stats?
It's not my idea, though, I found it in #5

Member Author:
Yeah, that's why I didn't notice the recent error in stats with the new dataframe method: the test file doesn't have any duplicated pairs... Should be easy to fix.