clusters_to_fams_noinfernal.py
from sys import stdout
from os.path import exists
from os import mkdir, walk, remove
from cogent.app.muscle_v38 import align_unaligned_seqs
from cogent.app.infernal_v11 import cmbuild_from_file, calibrate_file
from cogent.parse.fasta import MinimalFastaParser
from cogent import LoadSeqs, RNA
from time import time
from datetime import datetime
from multiprocessing import Pool, Manager
import argparse
from math import floor
from selextrace.stutils import cluster_seqs, write_fasta_list
from selextrace.bayeswrapper import bayesfold
from selextrace.ctilib import fold_clusters, group_by_shape, \
run_fold_for_infernal, group_by_forester, group_by_distance,\
make_r2r, run_infernal
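# Pipeline overview: cluster unique sequences with uclust, fold each cluster
# with BayesFold, merge clusters whose consensus structures agree (first by
# abstract shape, then by RNAdistance score), create CM and R2R structures per
# group, and finally join groups into families with RNAforester.
# Illustrative invocation (file and folder names are placeholders, not taken
# from the original source):
#   python clusters_to_fams_noinfernal.py -i round3_unique.fasta -f selection/ \
#       -o output/ -r 3 --sim 0.99 -c 8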
if __name__ == "__main__":
starttime = time()
parser = argparse.ArgumentParser(description="Runs sequence grouping \
and infernal over all rounds of a SELEX selection")
parser.add_argument('-i', required=True,
help="FASTA file of unique sequences sorted by abundance.")
parser.add_argument('-f', required=True,
help="Base folder holding FASTA files for all rounds of selection")
parser.add_argument('-o', required=True,
help="Base folder to output all data")
parser.add_argument('-r', required=True, type=int,
help="Current round of selection clusters come from")
parser.add_argument('--sim', type=float, default=0.99,
help="Similarity for uclust. (Default 0.99)")
parser.add_argument('--minseqs', type=int, default=100,
help="Min number of seqs for group to be significant (Default 100)")
parser.add_argument('--nr', action='store_true',
help="No reference sequence is available (used for totally random sequences)")
parser.add_argument('--rsc', type=float, default=25,
help="Score cutoff for RNAdistance (Default 25)")
parser.add_argument('--fsc', type=int, default=150,
help="Score cutoff for RNAforester. (Default 150)")
parser.add_argument('-c', type=int, default=1,
help="Number of CPUs to use (Default 1)")
args = parser.parse_args()
if args.r < 1:
print "ERROR: round must be at least 1!"
exit(1)
if args.c < 1:
print "ERROR: CPU count must be at least 1!"
exit(1)
if args.fsc < 0:
print "ERROR: RNAforester score cutoff must be at least 0!"
exit(1)
if args.sim < 0.0 or args.sim > 1.0:
print "ERROR: Uclust similarity must be >= 0 and <= 1!"
exit(1)
structscore = args.rsc
otufolder = args.o
if args.f[-1] != "/":
args.f += "/"
if otufolder[-1] != "/":
otufolder += "/"
if not exists(otufolder):
mkdir(otufolder)
date = str(datetime.now())
print "Program started ", date
#print out run info to a file
infofile = open(otufolder + "runparams.txt", 'w')
infofile.write(''.join(["Program started ", date, "\n",
"FASTA file:\t", args.i, "\n",
"Base folder:\t", args.f, "\n",
"Output folder:\t", args.o, "\n"
"Selection round:\t", str(args.r), "\n",
"Uclust simmilarity:\t", str(args.sim), "\n",
"Min seqs for group:\t", str(args.minseqs), "\n",
"RNAdistance score cutoff:\t", str(args.rsc), "\n",
"RNAforester score cutoff:\t", str(args.fsc), "\n",
"Infernal min score:\t", str(args.isc), "\n",
"CPUs:\t", str(args.c), "\n"]))
infofile.close()
print "==Clustering sequences by primary sequence=="
clusters = {}
secs = time()
if exists(otufolder + "clusters.txt"):
clustersin = open(otufolder + "clusters.txt")
currclust = ""
for header, seq in MinimalFastaParser(clustersin):
if "cluster_" in header:
currclust = header
clusters[currclust] = []
else:
clusters[currclust].append((header, seq))
clustersin.close()
print "Sequences previously clustered,", len(clusters), "clusters"
else:
print "Running uclust over sequences"
#cluster the initial sequences by sequence similarity
clusters = cluster_seqs(args.i, args.sim, folderout=args.o,
gapopen='10.0', gapext='10.0')
#write the clusters to a file
cout = open(otufolder + "clusters.txt", 'w')
hold = clusters.keys()
hold.sort()
for cluster in hold:
cout.write(">%s\n%s\n" % (cluster, cluster))
for seq in clusters[cluster]:
cout.write(">%s\n%s\n" % seq)
cout.close()
print str(len(clusters)) + " clusters"
print "Runtime: " + str((time() - secs)/60) + " min"
if not exists(otufolder + "cluster_structs.fasta"):
#create file to write to if not already there
cfo = open(otufolder + "cluster_structs.fasta", 'w')
cfo.close()
print "Running BayesFold over " + str(len(clusters)) + " clusters"
secs = time()
#make a pool of workers, one for each cpu available
manager = Manager()
pool = Pool(processes=args.c)
lock = manager.Lock()
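#the shared Manager lock is handed to each fold_clusters worker, presumably to
#serialize appends to cluster_structs.fasta across processes (assumption based
#on the shared output file, not verified against ctilib)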
#run the pool over all clusters to get file of structures
for cluster in clusters:
pool.apply_async(func=fold_clusters, args=(lock, cluster,
clusters[cluster], otufolder))
pool.close()
pool.join()
#read in all structures now that they are folded
structgroups = {}
groups_shape = {}
count = 0
cfo = open(otufolder + "cluster_structs.fasta", 'rU')
nostruct = []
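#each record in cluster_structs.fasta is expected to have a header of the form
#"<cluster name> <abstract shape>" with the dot-bracket consensus structure as
#its sequence line (format inferred from the parsing below)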
for cluster, struct in MinimalFastaParser(cfo):
count += 1
#remove unstructured groups
if struct.count("(") < 5:
nostruct.append(cluster)
continue
#create dictionary of sequences now keyed to structures
if struct in structgroups:
structgroups[struct].extend(clusters[cluster.split(" ")[0]])
else:
structgroups[struct] = clusters[cluster.split(" ")[0]]
#create groups_shape dictionary of abstract shapes
abshape = cluster.split(" ")[1]
if abshape in groups_shape:
groups_shape[abshape].append(struct)
else:
groups_shape[abshape] = [struct]
cfo.close()
if count != len(clusters):
raise AssertionError(str(count) + " structures, " + str(len(clusters))
+ " clusters. Not all clusters folded!")
else:
print "Clusters previously folded"
clusters.clear()
del clusters
if not exists(otufolder + "clusters_aln.fasta"):
startaln = time()
if not args.nr:
#add refseq and align all sequences for later seq/struct comparisons
refin = open(args.f + "refseq.fasta")
_, refseq = MinimalFastaParser(refin).next()
refin.close()
for struct in structgroups:
if not args.nr:
structgroups[struct].append(("refseq", refseq))
structgroups[struct] = align_unaligned_seqs(structgroups[struct], RNA)
if not args.nr:
#hacky way to make sure refseq first, will need to fix later
structgroups[struct].Names.remove("refseq")
structgroups[struct].Names.insert(0, "refseq")
#write alignments to a file to save time if a rerun is needed
cout = open(otufolder + "clusters_aln.fasta", 'w')
for struct in structgroups:
cout.write(">%s\n%s\n" % ("newcluster", struct))
cout.write(structgroups[struct].toFasta() + "\n")
cout.close()
print "Aligned all clusters: " + str((time() - startaln)/60) + " min"
else:
cin = open(otufolder + "clusters_aln.fasta")
currclust = []
currstruct = ""
structgroups = {}
first = True
for header, seq in MinimalFastaParser(cin):
if "cluster" in header:
if not first:
structgroups[currstruct] = LoadSeqs(data=currclust, moltype=RNA, aligned=True)
if not args.nr:
structgroups[currstruct].Names.remove("refseq")
structgroups[currstruct].Names.insert(0, "refseq")
first = False
currstruct = seq
currclust = []
else:
currclust.append((header, seq))
print "Clusters previously aligned"
print "Runtime: " + str((time() - secs)/60) + " min"
print "==Grouping clusters by secondary structure=="
#GROUP THE SECONDARY STRUCTURES BY RNADISTANCE
secs = time()
skipiter = False
if exists(otufolder + "fasta_groups/"):
skipiter = True
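#an existing fasta_groups/ folder means the structure-grouping stage finished
#in an earlier run, so it is skipped here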
if not skipiter:
print "start: " + str(len(structgroups)) + " initial groups"
#initial clustering by structures generated in first folding,
#broken out by shapes. No initial comparison needed between different shapes,
#as those structures are most likely not similar enough
files = []
hold = {}
pool = Pool(processes=args.c)
#run the pool over all shape groups to get final grouped structgroups
fout = open(otufolder + "shapesizes.txt", 'w')
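#shapesizes.txt records each abstract shape and the number of distinct
#structures assigned to it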
groupnum = 1
for shapegroup in groups_shape.keys():
#write out each group to file for use in subprocess
groupinfo = {struct: structgroups[struct] for struct in groups_shape[shapegroup]}
fout.write(shapegroup + "\t" + str(len(groupinfo)) + "\n")
pool.apply_async(func=group_by_distance,
args=(groupinfo, structscore), callback=hold.update)
structgroups.clear()
del structgroups
stime = time()
hold.update(group_by_distance(groupinfo, structscore, None, None, args.nr))
print len(groupinfo), "clusters:", str((time()-stime)/60), "min"
fout.close()
#memory saving wipe of structgroups, groups_shape, and groupinfo
groups_shape.clear()
del groups_shape
groupinfo.clear()
del groupinfo
pool.close()
pool.join()
print "Grouped ("+str((time()-stime)/60)+" min)"
structgroups = dict(hold)
del hold
#sort all structure sequences by count, highest to lowest
for struct in structgroups:
if not args.nr:
structgroups[struct].Names.remove('refseq')
structgroups[struct].Names.sort(reverse=True,
key=lambda count: int(count.split('_')[1]))
print str(len(structgroups))+" end groups ("+str((time()-secs)/60)+" min)"
#write out a degapped fasta file for each group containing its member sequences
mkdir(otufolder+"fasta_groups")
for num, group in enumerate(structgroups):
structgroups[group].degap().writeToFile(otufolder+"fasta_groups/group_"+str(num)+".fasta", "fasta")
else:
print "Previously grouped"
#wipe out structgroups dict to save memory
structgroups.clear()
del structgroups
print "==Creating CM and r2r structures=="
secs = time()
#smaller pool for memory savings, 1 task per child to try and gc each run
#NEED TO FIX BAYESFOLD ARRAY2D BEING A MEMORY HOG
pool = Pool(processes=max(1, int(args.c / 2)), maxtasksperchild=1)
#run the pool over all groups to get final structures
for group in walk(otufolder + "fasta_groups").next()[2]:
groupnum = group.split("_")[-1].split(".")[0]
pool.apply_async(func=run_fold_for_infernal,
args=(groupnum, otufolder+"fasta_groups/"+group, otufolder, args.minseqs))
pool.close()
pool.join()
#get sequence counts for each group
infernalorder = []
groups = {}
count = 0
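#assumed layout of each group folder written by run_fold_for_infernal:
#  log.txt             - lines 2 and 3 begin with the total and unique sequence counts
#  bayesfold-aln.fasta - the last line before the trailing newline holds the structure
#(inferred from the parsing below, not verified against ctilib)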
for group in walk(otufolder).next()[1]:
if group == "fasta_groups":
continue
count += 1
#read in group sequence counts
log = open(otufolder + group + "/log.txt")
loginfo = log.readlines()
log.close()
infernalorder.append((group, int(loginfo[1].split()[0]),
int(loginfo[2].split()[0])))
#read in group structure and build dict for families creation
structin = open(otufolder + group + "/bayesfold-aln.fasta")
struct = structin.read().split("\n")[-2].strip()
structin.close()
groups[struct] = [group]
#write out file of sequence counts
infernalorder.sort(reverse=True, key=lambda x: x[1])
groupsizefile = open(otufolder + "group_sizes.txt", 'w')
groupsizefile.write("Group\tTotal Seqs\tUnique Seqs\n")
for info in infernalorder:
groupsizefile.write("%s\t%s\t%s\n" % info)
groupsizefile.close()
print count, "final groups"
print "Runtime: " + str((time() - secs) / 3600) + " hrs"
print "==Creating families from groups=="
print "RNAforester score cutoff:", args.fsc
print len(groups), "initial groups"
#Now group by RNAforester local alignment to merge groups into larger families
groups = group_by_forester(groups, args.fsc, args.c)
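#families.txt: one family per line, member group names separated by tabs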
fout = open(otufolder + "families.txt", 'w')
for fam in groups:
fout.write("\t".join(groups[fam]) + "\n")
fout.close()
print len(groups), "final groups"
print "Runtime: " + str((time() - secs) / 3600) + " hrs"
groups.clear()
del groups
print "Program ended ", datetime.now(), " Runtime: " + \
str((time() - starttime)/3600) + "h"