-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbam_to_tbi.py
194 lines (147 loc) · 6.18 KB
/
bam_to_tbi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import numpy as np
import pysam
import subprocess
import argparse
import utils
import os, pdb
MIN_MAP_QUAL = 10
def parse_args():
parser = argparse.ArgumentParser(description=" convert bam data format to bigWig data format, "
" for ribosome profiling and RNA-seq data ")
parser.add_argument("--dtype",
choices=("rnaseq","riboseq"),
default="riboseq",
help="specifies the type of assay (default: riboseq)")
parser.add_argument("bam_file",
action="store",
help="path to bam input file")
options = parser.parse_args()
options.bgzip = which("bgzip")
options.tabix = which("tabix")
return options
def which(program):
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
path = path.strip('"')
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file
return None
def convert_rnaseq(options):
# file names and handles
count_file = os.path.splitext(options.bam_file)[0]
sam_handle = pysam.Samfile(options.bam_file, "rb")
count_handle = open(count_file, 'w')
for cname,clen in zip(sam_handle.references,sam_handle.lengths):
# fetch reads in chromosome
sam_iter = sam_handle.fetch(reference=cname)
# initialize count array
counts = dict()
for read in sam_iter:
# skip read if unmapped
if read.is_unmapped:
continue
# skip read, if mapping quality is low
if read.mapq < MIN_MAP_QUAL:
continue
if read.is_reverse:
site = read.pos + read.alen - 1
else:
site = read.pos
try:
counts[site] += 1
except KeyError:
counts[site] = 1
# write counts to output file
indices = np.sort(counts.keys())
for i in indices:
count_handle.write('\t'.join([cname,'%d'%i,'%d'%(i+1),'%d'%counts[i]])+'\n')
print "completed %s"%cname
sam_handle.close()
count_handle.close()
# compress count file
pipe = subprocess.Popen("%s -f %s"%(options.bgzip, count_file), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
# index count file
pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.gz"%(options.tabix, count_file), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
print "Compressed file with RNA-seq counts is %s"%(count_file+'.gz')
def convert_riboseq(options):
# file names and handles
fwd_count_file = os.path.splitext(options.bam_file)[0]+'_fwd'
rev_count_file = os.path.splitext(options.bam_file)[0]+'_rev'
sam_handle = pysam.Samfile(options.bam_file, "rb")
fwd_handle = dict([(r,open(fwd_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
rev_handle = dict([(r,open(rev_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
for cname,clen in zip(sam_handle.references,sam_handle.lengths):
# fetch reads in chromosome
sam_iter = sam_handle.fetch(reference=cname)
# initialize count arrays
fwd_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
rev_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
for read in sam_iter:
# skip reads not of the appropriate length
if read.rlen not in utils.READ_LENGTHS:
continue
# skip read if unmapped
if read.is_unmapped:
continue
# skip read, if mapping quality is low
if read.mapq < MIN_MAP_QUAL:
continue
if read.is_reverse:
asite = int(read.positions[-13])
try:
rev_counts[read.rlen][asite] += 1
except KeyError:
rev_counts[read.rlen][asite] = 1
else:
asite = int(read.positions[12])
try:
fwd_counts[read.rlen][asite] += 1
except KeyError:
fwd_counts[read.rlen][asite] = 1
# write counts to output files
for r in utils.READ_LENGTHS:
indices = np.sort(fwd_counts[r].keys())
for i in indices:
fwd_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%fwd_counts[r][i]])+'\n')
indices = np.sort(rev_counts[r].keys())
for i in indices:
rev_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%rev_counts[r][i]])+'\n')
print "completed %s"%cname
sam_handle.close()
for r in utils.READ_LENGTHS:
fwd_handle[r].close()
rev_handle[r].close()
for r in utils.READ_LENGTHS:
# compress count file
pipe = subprocess.Popen("%s -f %s.%d"%(options.bgzip, fwd_count_file, r), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
pipe = subprocess.Popen("%s -f %s.%d"%(options.bgzip, rev_count_file, r), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
# index count file
pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.%d.gz"%(options.tabix, fwd_count_file, r), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.%d.gz"%(options.tabix, rev_count_file, r), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
print "Compressed file with ribosome footprint counts on forward strand is %s"%(fwd_count_file+'.%d.gz'%r)
print "Compressed file with ribosome footprint counts on reverse strand is %s"%(rev_count_file+'.%d.gz'%r)
if __name__=="__main__":
options = parse_args()
if options.dtype=="rnaseq":
convert_rnaseq(options)
elif options.dtype=="riboseq":
convert_riboseq(options)