Cleanup and bugfixes pre-0.4.0:
* Removed OrderedDict remnants: #113

* pairtools dedup: fixed a Cython line forgotten in a recent PR: #107

* header: new append_columns function

* pairtools restrict: header improvement (restriction fragment columns are now declared in #columns)

* pairtools stats: fixed a dataframe slicing warning

* np.genfromtxt warning resolved: #71
agalitsyna committed Apr 7, 2022
1 parent 7e4712f commit a31f52d
Showing 6 changed files with 57 additions and 32 deletions.
6 changes: 2 additions & 4 deletions pairtools/_dedup.pyx
@@ -112,9 +112,6 @@ def mark_duplicates(
             high = low + 1  # restart high
             continue
-
-
-
         if methodid == 0:
             extraCondition = (max(abs(p1[low] - p1[high]),
                                   abs(p2[low] - p2[high])) <= max_mismatch)
@@ -202,11 +199,12 @@ cdef class OnlineDuplicateDetector(object):
         self.rm = self.rm[self.low:]
         self.high = self.high-self.low
         self.N = self.N - self.low
-        self.low = 0
         if self.returnData == 1:
+            self.low = 0
             return ret
         if self.keep_parent_id == 1:  # Return parent readIDs alongside with duplicates mask:
             pastidx = self.parent_idxs[:self.low]
+            self.low = 0
             return pastrm, pastidx
         return pastrm

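The change in finish() above is subtle: in the old code, self.low was reset to zero before parent_idxs[:self.low] was sliced, so the parent readIDs of duplicates always came back as an empty array. The fix moves the reset after the slice in each return branch. A plain-Python sketch of the failure mode, with illustrative values rather than the real Cython class:

import numpy as np

# Illustrative stand-ins for the detector's internal state; not pairtools API.
parent_idxs = np.array([10, 11, 12, 13, 14])

# Old order of operations: reset the cursor, then slice -> always empty.
low = 3          # entries resolved so far
low = 0
print(parent_idxs[:low])    # [] -- parent readIDs lost

# Fixed order: slice first, then reset the cursor for the next push().
low = 3
pastidx = parent_idxs[:low]
low = 0
print(pastidx)              # [10 11 12] -- parent readIDs preserved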
43 changes: 31 additions & 12 deletions pairtools/_headerops.py
@@ -1,4 +1,4 @@
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 import sys
 import copy
 import itertools
@@ -12,7 +12,8 @@


 PAIRS_FORMAT_VERSION = "1.0.0"
-
+SEP_COLS = " "
+SEP_CHROMS = " "

 def get_header(instream, comment_char="#"):
     """Returns a header from the stream and the remainder of the stream
@@ -92,7 +93,7 @@ def extract_column_names(header):
     columns = extract_fields(header, "columns")

     if len(columns) != 0:
-        return columns[0].split(" ")
+        return columns[0].split(SEP_COLS)
     else:
         return []

@@ -103,7 +104,7 @@ def extract_chromsizes(header):
     """

     chromsizes_str = extract_fields(header, "chromsize")
-    chromsizes_str = list(zip(*[s.split(" ") for s in chromsizes_str]))
+    chromsizes_str = list(zip(*[s.split(SEP_CHROMS) for s in chromsizes_str]))
     chromsizes = pd.Series(data=chromsizes_str[1], index=chromsizes_str[0]).astype(
         np.int64
     )
@@ -112,10 +113,10 @@


 def get_chromsizes_from_pysam_header(samheader):
-    """Convert pysam header to pairtools chromosomes (Ordered dict).
+    """Convert pysam header to pairtools chromosomes dict (ordered by Python default since 3.7).
     Example of pysam header converted to dict:
-    OrderedDict([
+    dict([
         ('SQ', [{'SN': 'chr1', 'LN': 248956422},
                 {'SN': 'chr10', 'LN': 133797422},
                 {'SN': 'chr11', 'LN': 135086622},
@@ -125,15 +126,15 @@ def get_chromsizes_from_pysam_header(samheader):
     """
     SQs = samheader.to_dict()["SQ"]
     chromsizes = [(sq["SN"], int(sq["LN"])) for sq in SQs]
-    return OrderedDict(chromsizes)
+    return dict(chromsizes)


 def get_chrom_order(chroms_file, sam_chroms=None):
     """
     Produce an "enumeration" of chromosomes based on the list
     of chromosomes
     """
-    chrom_enum = OrderedDict()
+    chrom_enum = dict()
     i = 1
     with open(chroms_file, "rt") as f:
         for line in f:
@@ -176,7 +177,7 @@ def make_standard_pairsheader(
     for chrom, length in chromsizes:
         header.append("#chromsize: {} {}".format(chrom, length))

-    header.append("#columns: " + " ".join(columns))
+    header.append("#columns: " + SEP_COLS.join(columns))

     return header

@@ -188,7 +189,7 @@ def subset_chroms_in_pairsheader(header, chrom_subset):
             if line.strip().split()[1] in chrom_subset:
                 new_header.append(line)
         elif line.startswith("#chromosomes:"):
-            line = " ".join(
+            line = SEP_CHROMS.join(
                 ["#chromosomes:"]
                 + [c for c in line.strip().split()[1:] if c in chrom_subset]
             )
@@ -225,8 +226,8 @@ def mark_header_as_sorted(header):
         header.insert(0, "#sorted: chr1-chr2-pos1-pos2")
     for i in range(len(header)):
         if header[i].startswith("#chromosomes"):
-            chroms = header[i][12:].strip().split(" ")
-            header[i] = "#chromosomes: {}".format(" ".join(sorted(chroms)))
+            chroms = header[i][12:].strip().split(SEP_CHROMS)
+            header[i] = "#chromosomes: {}".format(SEP_CHROMS.join(sorted(chroms)))
     return header


@@ -609,6 +610,24 @@ def merge_headers(headers, force=False):
     return new_header


+def append_columns(header, columns):
+    """
+    Appends columns to the header, separated by SEP_COLS
+
+    Parameters
+    ----------
+    header: Previous header
+    columns: List of column names to append
+
+    Returns
+    -------
+    Modified header (appended columns to the field "#columns")
+    """
+    for i in range(len(header)):
+        if header[i].startswith("#columns: "):
+            header[i] += SEP_COLS + SEP_COLS.join(columns)
+    return header
+
 # def _guess_genome_assembly(samheader):
 #     PG = [l for l in samheader if l.startswith('@PG') and '\tID:bwa' in l][0]
 #     CL = [field for field in PG.split('\t') if field.startswith('CL:')]
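A quick usage sketch for the new append_columns (the function body is copied from the diff above so the snippet runs standalone; the header line is an example pairs header):

SEP_COLS = " "

def append_columns(header, columns):
    # Extend the "#columns" field of a pairs header in place.
    for i in range(len(header)):
        if header[i].startswith("#columns: "):
            header[i] += SEP_COLS + SEP_COLS.join(columns)
    return header

header = ["#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type"]
header = append_columns(header, ["parent_readID"])
print(header[0])
# #columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type parent_readID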
6 changes: 4 additions & 2 deletions pairtools/pairtools_dedup.py
@@ -401,8 +401,8 @@ def dedup_py(
     outstream.writelines((l + "\n" for l in header))
     if send_header_to_dup and outstream_dups and (outstream_dups != outstream):
         dups_header = header
-        if keep_parent_id:
-            dups_header[-1] += " parent_readID"
+        if keep_parent_id and len(dups_header) > 0:
+            dups_header = _headerops.append_columns(dups_header, ["parent_readID"])
         outstream_dups.writelines((l + "\n" for l in dups_header))
     if (
         outstream_unmapped
@@ -911,6 +911,7 @@ def streaming_dedup_cython(
                     ar(s1, 32),
                     ar(s2, 32),
                 )
+
             else:
                 res = dd.push(
                     ar(c1, 32),
@@ -925,6 +926,7 @@
         if keep_parent_id:
             res_tmp, parents_tmp = dd.finish()
             parents = np.concatenate([parents, parents_tmp])
+
         else:
             res_tmp = dd.finish()
         res = np.concatenate([res, res_tmp])
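For context, the control flow around these two hunks: chunks of pairs are push()ed into the online detector inside the loop, and dd.finish() flushes whatever is still buffered once the stream ends; with keep_parent_id, the duplicate mask comes back together with parent readIDs. A runnable sketch of the accumulation pattern, using a hypothetical stub in place of OnlineDuplicateDetector:

import numpy as np

class StubDetector:
    # Hypothetical stand-in for OnlineDuplicateDetector, only to show the
    # push/finish accumulation pattern; the real class resolves duplicates
    # online and returns the mask for entries it could already decide.
    def __init__(self):
        self._mask = np.zeros(0, dtype=np.int8)
        self._parents = np.zeros(0, dtype=np.int64)

    def push(self, mask_chunk, parent_chunk):
        self._mask = np.concatenate([self._mask, mask_chunk])
        self._parents = np.concatenate([self._parents, parent_chunk])
        return np.zeros(0, dtype=np.int8)   # pretend nothing resolves mid-stream

    def finish(self):
        # Flush everything still buffered, as dd.finish() does above.
        return self._mask, self._parents

dd = StubDetector()
res = np.zeros(0, dtype=np.int8)
parents = np.zeros(0, dtype=np.int64)

for mask_chunk, parent_chunk in [
    (np.array([0, 1], dtype=np.int8), np.array([100, 100])),
    (np.array([0], dtype=np.int8), np.array([101])),
]:
    res = np.concatenate([res, dd.push(mask_chunk, parent_chunk)])

res_tmp, parents_tmp = dd.finish()          # keep_parent_id branch
parents = np.concatenate([parents, parents_tmp])
res = np.concatenate([res, res_tmp])
print(res, parents)                         # [0 1 0] [100 100 101]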
1 change: 0 additions & 1 deletion pairtools/pairtools_parse.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from collections import OrderedDict
 import subprocess
 import fileinput
 import itertools
12 changes: 10 additions & 2 deletions pairtools/pairtools_restrict.py
@@ -59,9 +59,17 @@ def restrict_py(pairs_path, frags, output, **kwargs):

     header, body_stream = _headerops.get_header(instream)
     header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
+    header = _headerops.append_columns(header,
+        ['rfrag1', 'rfrag_start1', 'rfrag_end1',
+         'rfrag2', 'rfrag_start2', 'rfrag_end2'])
+
     outstream.writelines((l+'\n' for l in header))
     rfrags = np.genfromtxt(
-        frags, delimiter='\t', comments='#', dtype=None,
+        frags,
+        delimiter='\t',
+        comments='#',
+        dtype=None,
+        encoding='ascii',
         names=['chrom', 'start', 'end'])


@@ -92,7 +100,7 @@ def restrict_py(pairs_path, frags, output, **kwargs):


 def find_rfrag(rfrags, chrom, pos):
-    rsites_chrom = rfrags[chrom.encode('ascii')]
+    rsites_chrom = rfrags[chrom]
     idx = min(max(0,rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)
     return idx, rsites_chrom[idx], rsites_chrom[idx+1]

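Two things happen in this file at once: passing encoding='ascii' silences the np.genfromtxt deprecation warning (#71), and as a side effect genfromtxt now yields str chromosome names instead of bytes, which is why find_rfrag can drop the chrom.encode('ascii') workaround. A self-contained sketch with made-up fragment data:

import io
import numpy as np

frags = io.StringIO("chr1\t0\t100\nchr1\t100\t250\nchr1\t250\t400\n")
rfrags = np.genfromtxt(
    frags, delimiter='\t', comments='#', dtype=None,
    encoding='ascii', names=['chrom', 'start', 'end'])
print(rfrags['chrom'][0])                  # 'chr1' as str, not b'chr1'

# The same searchsorted arithmetic as find_rfrag, on one chromosome's
# sorted fragment boundaries:
rsites = np.array([0, 100, 250, 400])
pos = 120
idx = min(max(0, rsites.searchsorted(pos, 'right') - 1), len(rsites) - 2)
print(idx, rsites[idx], rsites[idx + 1])   # 1 100 250: pos falls in [100, 250)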
21 changes: 10 additions & 11 deletions pairtools/pairtools_stats.py
@@ -452,9 +452,7 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
         self._stat["total_unmapped"] += int(unmapped_count)

         # Count the mapped:
-        df_mapped = df[
-            (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom)
-        ]
+        df_mapped = df.loc[(df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :]
         mapped_count = df_mapped.shape[0]

         self._stat["total_mapped"] += mapped_count
@@ -480,15 +478,16 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
         )

         # Count cis-trans by pairs:
-        df_nodups = df_mapped[~mask_dups]
-        cis = df_nodups[df_nodups["chrom1"] == df_nodups["chrom2"]]
-        self._stat["cis"] += cis.shape[0]
-        self._stat["trans"] += df_nodups.shape[0] - cis.shape[0]
-        dist = np.abs(cis["pos2"].values - cis["pos1"].values)
-
-        cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1
+        df_nodups = df_mapped.loc[~mask_dups, :]
+        mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"]
+        df_cis = df_nodups.loc[mask_cis, :].copy()
+        self._stat["cis"] += df_cis.shape[0]
+        self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0]
+        dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values)
+
+        df_cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1
         for (strand1, strand2, bin_id), strand_bin_count in (
-            cis[["strand1", "strand2", "bin_idx"]].value_counts().items()
+            df_cis[["strand1", "strand2", "bin_idx"]].value_counts().items()
         ):
             self._stat["dist_freq"][strand1 + strand2][bin_id] += strand_bin_count
         self._stat["cis_1kb+"] += int(np.sum(dist >= 1000))
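The slicing fix in context: the old chained indexing (df_mapped[~mask_dups], then cis[...], then an assignment through cis.loc) is the pattern that makes pandas emit its SettingWithCopyWarning, because pandas cannot tell whether the write lands on a view of the original frame or on a copy. Selecting once with .loc and taking an explicit .copy() gives df_cis an owned frame that is safe to assign to. A minimal reproduction with made-up pairs (the bin edges stand in for self._dist_bins):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "chrom1": ["chr1", "chr1", "chr2"],
    "chrom2": ["chr1", "chr2", "chr2"],
    "pos1": [100, 200, 300],
    "pos2": [1500, 400, 350],
})

# Old pattern: chained selection, then assignment through the result.
cis = df[df["chrom1"] == df["chrom2"]]
# cis.loc[:, "bin_idx"] = ...   # warns: pandas cannot tell view from copy

# New pattern: one .loc selection plus .copy() yields an owned frame.
df_cis = df.loc[df["chrom1"] == df["chrom2"], :].copy()
dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values)
df_cis.loc[:, "bin_idx"] = np.searchsorted([0, 10, 1000, 10000], dist, "right") - 1
print(df_cis)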
