restrict fixes

* handle empty chromosomes, resolved #76 * fixed rfrags indexing and first rfrag omission, resolved #73 * resolved or deprecated #16 * pairtools restrct tests
open2c · Apr 7, 2022 · 5a1cc76 · 5a1cc76
1 parent a31f52d
commit 5a1cc76
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 14 deletions.
diff --git a/pairtools/pairtools_restrict.py b/pairtools/pairtools_restrict.py
@@ -4,6 +4,7 @@
 import sys
 import click
 import subprocess
+import warnings
 
 import numpy as np
 
@@ -14,7 +15,7 @@
 @cli.command()
 
 @click.argument(
-    'pairs_path', 
+    'pairs_path',
     type=str,
     required=False)
 
@@ -26,9 +27,9 @@
          '(chrom, start, end). Can be generated using cooler digest.')
 
 @click.option(
-    '-o', "--output", 
-    type=str, 
-    default="", 
+    '-o', "--output",
+    type=str,
+    default="",
     help='output .pairs/.pairsam file.'
         ' If the path ends with .gz/.lz4, the output is compressed by bgzip/lz4c.'
         ' By default, the output is printed into stdout.')
@@ -40,20 +41,22 @@ def restrict(pairs_path, frags, output, **kwargs):
 
     Identify the restriction fragments that got ligated into a Hi-C molecule.
 
-    PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz/.lz4, the 
+    Note: rfrags are 0-indexed
+
+    PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz/.lz4, the
     input is decompressed by bgzip/lz4c. By default, the input is read from stdin.
     '''
     restrict_py(pairs_path, frags, output, **kwargs)
 
 def restrict_py(pairs_path, frags, output, **kwargs):
-    instream = (_fileio.auto_open(pairs_path, mode='r', 
+    instream = (_fileio.auto_open(pairs_path, mode='r',
                                   nproc=kwargs.get('nproc_in'),
-                                  command=kwargs.get('cmd_in', None)) 
+                                  command=kwargs.get('cmd_in', None))
                 if pairs_path else sys.stdin)
 
-    outstream = (_fileio.auto_open(output, mode='w', 
+    outstream = (_fileio.auto_open(output, mode='w',
                                    nproc=kwargs.get('nproc_out'),
-                                   command=kwargs.get('cmd_out', None)) 
+                                   command=kwargs.get('cmd_out', None))
                  if output else sys.stdout)
 
 
@@ -73,15 +76,13 @@ def restrict_py(pairs_path, frags, output, **kwargs):
         names=['chrom', 'start', 'end'])
 
 
-    rfrags.sort(order=['chrom', 'start','end'])
     rfrags.sort(order=['chrom', 'start', 'end'])
     chrom_borders = np.r_[0,
                           1+np.where(rfrags['chrom'][:-1] != rfrags['chrom'][1:])[0],
                           rfrags.shape[0]]
-    rfrags = {rfrags['chrom'][i]:rfrags['end'][i:j] +1
+    rfrags = { rfrags['chrom'][i] : np.concatenate([[0], rfrags['end'][i:j] + 1])
               for i, j in zip(chrom_borders[:-1], chrom_borders[1:])}
 
-
     for line in body_stream:
         cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
         chrom1, pos1 = cols[_pairsam_format.COL_C1], int(cols[_pairsam_format.COL_P1])
@@ -100,8 +101,18 @@ def restrict_py(pairs_path, frags, output, **kwargs):
 
 
 def find_rfrag(rfrags, chrom, pos):
-    rsites_chrom = rfrags[chrom]
-    idx = min(max(0,rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)
+
+    # Return empty if chromosome is unmapped:
+    if chrom==_pairsam_format.UNMAPPED_CHROM:
+        return _pairsam_format.UNANNOTATED_RFRAG, _pairsam_format.UNMAPPED_POS, _pairsam_format.UNMAPPED_POS
+
+    try:
+        rsites_chrom = rfrags[chrom]
+    except ValueError as e:
+        warnings.warn(f"Chomosome {chrom} does not have annotated restriction fragments, return empty.")
+        return _pairsam_format.UNANNOTATED_RFRAG, _pairsam_format.UNMAPPED_POS, _pairsam_format.UNMAPPED_POS
+
+    idx = min( max(0, rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)
     return idx, rsites_chrom[idx], rsites_chrom[idx+1]
 
 if __name__ == '__main__':

diff --git a/tests/data/mock.rsites.bed b/tests/data/mock.rsites.bed
@@ -0,0 +1,5 @@
+chr1	0	100
+chr1	100	500
+chr1	500	10000
+chr2	0	200
+chr2	200	10000
diff --git a/tests/data/mock.test-restr.pairs b/tests/data/mock.test-restr.pairs
@@ -0,0 +1,18 @@
+## pairs format v1.0.0
+#shape: upper triangle
+#genome_assembly: unknown
+#samheader: @SQ SN:chr1 LN:10000
+#samheader: @SQ SN:chr2 LN:10000
+#samheader: @PG ID:bwa  PN:bwa  VN:0.7.15-r1140 CL:bwa mem -SP /path/ucsc.hg19.fasta.gz /path/1.fastq.gz /path/2.fastq.gz
+#chromosomes: chr1 chr2
+#chromsize: chr1 10000
+#chromsize: chr2 10000
+#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type rfrag_test1 rfrag_test2
+readid01	chr1	1	chr2	20	+	+	UU	0	0
+readid02	chr1	100	chr2	20	-	+	UU	0	0
+readid03	chr1	100	chr2	20	+	+	UU	0	0
+readid04	chr1	499	chr2	20	+	+	UU	1	0
+readid05	chr1	600	chr2	20	+	+	UU	2	0
+readid06	chr1	1	chr2	200	+	+	UU	0	0
+readid07	chr1	1	chr2	500	+	+	UU	0	1
+readid08	chr1	10001	chr2	10001	+	+	UU	2	1
diff --git a/tests/test_restrict.py b/tests/test_restrict.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+
+from nose.tools import assert_raises
+
+import subprocess
+
+testdir = os.path.dirname(os.path.realpath(__file__))
+
+def test_restrict():
+    """Restrict pairs file"""
+    mock_pairs_path = os.path.join(testdir, "data", "mock.test-restr.pairs")
+    mock_rfrag_path = os.path.join(testdir, "data", "mock.rsites.bed")
+    try:
+        result = subprocess.check_output(
+            [
+                "python",
+                "-m",
+                "pairtools",
+                "restrict",
+                "-f",
+                mock_rfrag_path,
+                mock_pairs_path,
+            ],
+        ).decode("ascii")
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        print(sys.exc_info())
+        raise e
+
+    # check if the header got transferred correctly
+    true_header = [l.strip() for l in open(mock_pairs_path, "r") if l.startswith("@")]
+    output_header = [l.strip() for l in result.split("\n") if l.startswith("#")]
+    for l in true_header:
+        assert any([l in l2 for l2 in output_header])
+
+    # check that the pairs got assigned properly
+    cols = [x for x in output_header if x.startswith('#columns')][0].split(' ')[1:]
+
+    COL_RFRAG1_TRUE = cols.index('rfrag_test1')
+    COL_RFRAG2_TRUE = cols.index('rfrag_test2')
+    COL_RFRAG1_OUTPUT = cols.index('rfrag1')
+    COL_RFRAG2_OUTPUT = cols.index('rfrag2')
+
+    for l in result.split("\n"):
+        if l.startswith("#") or not l:
+            continue
+
+        line = l.split()
+        assert line[COL_RFRAG1_TRUE] == line[COL_RFRAG1_OUTPUT]
+        assert line[COL_RFRAG2_TRUE] == line[COL_RFRAG2_OUTPUT]