Skip to content

Commit

Permalink
restrict fixes
Browse files Browse the repository at this point in the history
* handle empty chromosomes, resolved
#76

* fixed rfrags indexing and first rfrag omission, resolved
#73

* resolved or deprecated #16

* pairtools restrct tests
  • Loading branch information
agalitsyna committed Apr 7, 2022
1 parent a31f52d commit 5a1cc76
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 14 deletions.
39 changes: 25 additions & 14 deletions pairtools/pairtools_restrict.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import click
import subprocess
import warnings

import numpy as np

Expand All @@ -14,7 +15,7 @@
@cli.command()

@click.argument(
'pairs_path',
'pairs_path',
type=str,
required=False)

Expand All @@ -26,9 +27,9 @@
'(chrom, start, end). Can be generated using cooler digest.')

@click.option(
'-o', "--output",
type=str,
default="",
'-o', "--output",
type=str,
default="",
help='output .pairs/.pairsam file.'
' If the path ends with .gz/.lz4, the output is compressed by bgzip/lz4c.'
' By default, the output is printed into stdout.')
Expand All @@ -40,20 +41,22 @@ def restrict(pairs_path, frags, output, **kwargs):
Identify the restriction fragments that got ligated into a Hi-C molecule.
PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz/.lz4, the
Note: rfrags are 0-indexed
PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz/.lz4, the
input is decompressed by bgzip/lz4c. By default, the input is read from stdin.
'''
restrict_py(pairs_path, frags, output, **kwargs)

def restrict_py(pairs_path, frags, output, **kwargs):
instream = (_fileio.auto_open(pairs_path, mode='r',
instream = (_fileio.auto_open(pairs_path, mode='r',
nproc=kwargs.get('nproc_in'),
command=kwargs.get('cmd_in', None))
command=kwargs.get('cmd_in', None))
if pairs_path else sys.stdin)

outstream = (_fileio.auto_open(output, mode='w',
outstream = (_fileio.auto_open(output, mode='w',
nproc=kwargs.get('nproc_out'),
command=kwargs.get('cmd_out', None))
command=kwargs.get('cmd_out', None))
if output else sys.stdout)


Expand All @@ -73,15 +76,13 @@ def restrict_py(pairs_path, frags, output, **kwargs):
names=['chrom', 'start', 'end'])


rfrags.sort(order=['chrom', 'start','end'])
rfrags.sort(order=['chrom', 'start', 'end'])
chrom_borders = np.r_[0,
1+np.where(rfrags['chrom'][:-1] != rfrags['chrom'][1:])[0],
rfrags.shape[0]]
rfrags = {rfrags['chrom'][i]:rfrags['end'][i:j] +1
rfrags = { rfrags['chrom'][i] : np.concatenate([[0], rfrags['end'][i:j] + 1])
for i, j in zip(chrom_borders[:-1], chrom_borders[1:])}


for line in body_stream:
cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
chrom1, pos1 = cols[_pairsam_format.COL_C1], int(cols[_pairsam_format.COL_P1])
Expand All @@ -100,8 +101,18 @@ def restrict_py(pairs_path, frags, output, **kwargs):


def find_rfrag(rfrags, chrom, pos):
rsites_chrom = rfrags[chrom]
idx = min(max(0,rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)

# Return empty if chromosome is unmapped:
if chrom==_pairsam_format.UNMAPPED_CHROM:
return _pairsam_format.UNANNOTATED_RFRAG, _pairsam_format.UNMAPPED_POS, _pairsam_format.UNMAPPED_POS

try:
rsites_chrom = rfrags[chrom]
except ValueError as e:
warnings.warn(f"Chomosome {chrom} does not have annotated restriction fragments, return empty.")
return _pairsam_format.UNANNOTATED_RFRAG, _pairsam_format.UNMAPPED_POS, _pairsam_format.UNMAPPED_POS

idx = min( max(0, rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)
return idx, rsites_chrom[idx], rsites_chrom[idx+1]

if __name__ == '__main__':
Expand Down
5 changes: 5 additions & 0 deletions tests/data/mock.rsites.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
chr1 0 100
chr1 100 500
chr1 500 10000
chr2 0 200
chr2 200 10000
18 changes: 18 additions & 0 deletions tests/data/mock.test-restr.pairs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
## pairs format v1.0.0
#shape: upper triangle
#genome_assembly: unknown
#samheader: @SQ SN:chr1 LN:10000
#samheader: @SQ SN:chr2 LN:10000
#samheader: @PG ID:bwa PN:bwa VN:0.7.15-r1140 CL:bwa mem -SP /path/ucsc.hg19.fasta.gz /path/1.fastq.gz /path/2.fastq.gz
#chromosomes: chr1 chr2
#chromsize: chr1 10000
#chromsize: chr2 10000
#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type rfrag_test1 rfrag_test2
readid01 chr1 1 chr2 20 + + UU 0 0
readid02 chr1 100 chr2 20 - + UU 0 0
readid03 chr1 100 chr2 20 + + UU 0 0
readid04 chr1 499 chr2 20 + + UU 1 0
readid05 chr1 600 chr2 20 + + UU 2 0
readid06 chr1 1 chr2 200 + + UU 0 0
readid07 chr1 1 chr2 500 + + UU 0 1
readid08 chr1 10001 chr2 10001 + + UU 2 1
52 changes: 52 additions & 0 deletions tests/test_restrict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import os
import sys

from nose.tools import assert_raises

import subprocess

testdir = os.path.dirname(os.path.realpath(__file__))

def test_restrict():
"""Restrict pairs file"""
mock_pairs_path = os.path.join(testdir, "data", "mock.test-restr.pairs")
mock_rfrag_path = os.path.join(testdir, "data", "mock.rsites.bed")
try:
result = subprocess.check_output(
[
"python",
"-m",
"pairtools",
"restrict",
"-f",
mock_rfrag_path,
mock_pairs_path,
],
).decode("ascii")
except subprocess.CalledProcessError as e:
print(e.output)
print(sys.exc_info())
raise e

# check if the header got transferred correctly
true_header = [l.strip() for l in open(mock_pairs_path, "r") if l.startswith("@")]
output_header = [l.strip() for l in result.split("\n") if l.startswith("#")]
for l in true_header:
assert any([l in l2 for l2 in output_header])

# check that the pairs got assigned properly
cols = [x for x in output_header if x.startswith('#columns')][0].split(' ')[1:]

COL_RFRAG1_TRUE = cols.index('rfrag_test1')
COL_RFRAG2_TRUE = cols.index('rfrag_test2')
COL_RFRAG1_OUTPUT = cols.index('rfrag1')
COL_RFRAG2_OUTPUT = cols.index('rfrag2')

for l in result.split("\n"):
if l.startswith("#") or not l:
continue

line = l.split()
assert line[COL_RFRAG1_TRUE] == line[COL_RFRAG1_OUTPUT]
assert line[COL_RFRAG2_TRUE] == line[COL_RFRAG2_OUTPUT]

0 comments on commit 5a1cc76

Please sign in to comment.