-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGattacaFinish.py
425 lines (363 loc) · 18.9 KB
/
GattacaFinish.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
import sys
import os
from collections import OrderedDict
import argparse
try:
import ConfigParser as configparser # for python 2
except:
import configparser # for python 3
import JavaCodeGen
from gattaca_postplots import Plots
import glob
from scipy.stats import binom, gamma, poisson
import pickle
import time
from gattaca_classes import GeneLocs, FastaRecords
def Parser():
# get user variables
parser = argparse.ArgumentParser()
requiredNamed = parser.add_argument_group("Required Arguments")
inFile = parser.add_argument("-i", "--file", dest="inFile", default="./tests/stemCA.3.0/", help="Output file or directory from simulations.", required=True) #TODO change to true
output = parser.add_argument("-o", "--output", dest="output", default="./tests/stemCA.3.0/gattaca/", help="Destination for directory to be created.", required=True)
geneList = parser.add_argument("-f", "--geneList", dest="geneList", default="./tests/TestGenes.txt", help="List of genes, one per line to use for GattacaExample.Gattaca.", required=True)
genome = parser.add_argument("-g", "--genome", dest="genome", default="GRCh37.75", help="Reference genome to use.")
depth = parser.add_argument("-d", "--depth", dest="depth", default=500, help="Sequencing depth. Default 500.")
depthDist = parser.add_argument("-k", "--depthFile", dest="depthFile", default=None, help="If a distribution for depth is desired use a file with one depth per variant per line.")
cutoff = parser.add_argument("-c", "--cutoff", dest="cutoff", default=0.005, help="Minimum variant allele frequency (higher depth=lower cutoff recommended). Default is a generous 0.005.")
name = parser.add_argument("-n", "--name", dest="name", default="gattacaRun", help="Name of run. Default=gattacaRun.")
tmpts = parser.add_argument('-t', '--timepoints', dest="tmpts", nargs='+', help='List of timepoints. Leave blank if you only want the last timepoint.', required=False)
plots = parser.add_argument("-p", "--getPlots", dest="plots", default=True, action='store_false', help="Specifies whether to create a BAM file for use post simulation.")
vcfs = parser.add_argument("-vcf", "--getvcfs", dest="vcfs", default=False, action='store_true', help="Whether or not to construct VCF files from mutations.")
verbose = parser.add_argument("-v", "--verbose", dest="verbose", default=True, action='store_false', help="Whether verbose output is turned on (development/debugging).")
Options = parser.parse_args() # main user args
Options.output=os.path.expanduser(Options.output)
Options.inFile=os.path.expanduser(Options.inFile)
if Options.output[len(Options.output)-1]!="/":
Options.output+="/"
if Options.inFile[len(Options.inFile)-1]!="/":
Options.inFile+="/"
# try:
# Options.mutRate = float(Options.mutRate)
# except:
# sys.exit("Improper mutation rate argument. Ensure this can be parsed as a decimal.")
return(Options)
def ConfigSectionMap(section, Config):
dict1 = {}
options = Config.options(section)
for option in options:
try:
dict1[option] = Config.get(section, option)
if dict1[option] == -1:
print("skip: %s" % option)
except:
print("exception on %s!" % option)
dict1[option] = None
return dict1
class Data:
def __init__(self, Options):
# Raw Information
self.data = Options.inFile
print("Processing %s"%(self.data))
self.tmpts = None
self.cloneIDs = None
self.parentIDs = None
self.parsedData = OrderedDict() # Holds clone information with full mutational genome
self._readFile()
self.lineages = {}
start_time = time.clock()
self._buildLineages()
self._buildFullTimepointGenome()
self.contexts = self._getOrder()
print("Finished building lineages: %.2f seconds"%(time.clock()-start_time))
# Stats
'''Provides information on a per timepoint snapshot {time 1: {clone info}, time 2: {clone info}, etc.}'''
self.mutsAtTimepoint = []
self.clonesWithMutsAtTimepoint = []
self.popsWithMutsAtTimepoint = []
self.popSize = []
self.mutContexts = []
self.VAFs = []
start_time = time.clock()
self._getMutContexts()
print("Finished extracting mutation contexts: %.2f seconds"%(time.clock()-start_time))
start_time = time.clock()
self._getRawVAFs(Options)
print("Finished getting VAFs: %.2f seconds"%(time.clock()-start_time))
# Get Muts per clone per timepoint
def _readFile(self):
with open(self.data, "r") as theFile:
lines = theFile.readlines()
lines = [line.replace('\n','').split(',') for line in lines]
idxTmptsStart = [i for i, k in enumerate(lines[0]) if k.startswith("ParentID")][0]+1
self.tmpts = [int(float(i)) for i in lines[0][idxTmptsStart:len(lines[0])]]
self.cloneIDs = []
self.parentIDs = []
for i, vals in enumerate(lines[1:]):
self.cloneIDs.append(vals[4])
self.parentIDs.append(vals[5])
self.parsedData.update({ int(vals[4]) : {'CloneID': vals[4], 'hsv':(vals[1],vals[2],vals[3]), 'ParentID': vals[5], 'Pops': [i for i in vals[idxTmptsStart:len(vals)]], 'Genome': vals[0], 'NumMuts': 0 } })
def _buildFullTimepointGenome(self):
vals = [0 for i in range(0,99)]
for i, c in enumerate(self.cloneIDs):
# print("Building full genome for clone: " + c)
genome = self.parsedData[int(c)]['Genome']
assert int(self.lineages[c][0])==int(c), "Clones may not match Clone: %s, Ancestor 1st Clone: %s (these two should be equal)."%(c, ancestor[0])
for j, ancestor in enumerate([int(item) for item in self.lineages[c]]):
if ancestor!=int(c) and ancestor != 0:
genome+=self._getClonesPrivateGenome(ancestor)
self.parsedData[int(c)]['Genome']=genome
self.parsedData[int(c)]['NumMuts']=len(genome.split(';'))
def _getClonesPrivateGenome(self, cloneID):
return(self.parsedData[int(cloneID)]['Genome'])
def _buildLineages(self):
for i, clone in enumerate(self.cloneIDs):
lineage = [clone]
p = self.parentIDs[i] # starting parent
lineage.append(p)
while p!='0': # continue until parent is 0 and clone is 0
# parent of every parent.
p = self._getCloneIDofParent(p)
lineage.append(p)
self.lineages.update({clone:lineage})
def _getCloneIDofParent(self, parent):
for i, clone in enumerate(self.cloneIDs):
if clone==parent:
return(self.parentIDs[i])
def _IDoNotKnowWhatWasHere(self, clone):
for k, parent in enumerate(self.cloneIDs):
if parent==clone:
return(parent)
def _getOrder(self):
bases = ['A','C','G','T']
muts = ['[C>A]','[C>G]','[C>T]','[T>A]','[T>C]','[T>G]']
vals = []
for pos2 in range(0, 6):
for pos1 in range(0,4):
for pos3 in range(0,4):
vals.append(bases[pos1]+muts[pos2]+bases[pos3])
return(vals)
def _getMutContexts(self):
for i, t in enumerate(self.tmpts):
self._totalPop(i)
self._getMutationsAtTmpt(i)
out = OrderedDict.fromkeys(self.contexts)
for k in out:
out[k] = 0
for mut in self.mutsAtTimepoint[i]:
out[mut.split('.')[3]]+=1
self.mutContexts.append(out)
# print(self.mutContexts)
def _getMutationsAtTmpt(self, timeIdx):
muts = []
for c in self.cloneIDs:
if int(self.parsedData[int(c)]['Pops'][timeIdx]) != 0:
m = self.parsedData[int(c)]['Genome'].split(';')
for singleMut in m:
if singleMut != '':
muts.append(singleMut)
self.mutsAtTimepoint.append(list(set(muts)))
def _totalPop(self, timeIdx):
val = 0
for c in self.cloneIDs:
val += int(self.parsedData[int(c)]['Pops'][timeIdx])
self.popSize.append(val)
def _getRawVAFs(self, Options):
start_time = time.clock()
cellsWithMut = OrderedDict()
for i, muts in enumerate(self.mutsAtTimepoint):
theMuts={}
for mut in muts:
theMuts.update({mut: {'cells': 0.0, 'rawVAF': 0.0, 'correctedVAF': 0.0, 'reads': 0, 'depth': 0}})
cellsWithMut.update({i:theMuts})
if(Options.verbose):
print("Setup muts: %.2f seconds"%(time.clock()-start_time))
start_time = time.clock()
for clone in self.parsedData: # Clone
g = self.parsedData[int(clone)]['Genome'].split(';')
for mut in g:
if mut != '':
for i, val in enumerate(self.parsedData[int(clone)]['Pops']):
if int(val)>0:
cellsWithMut[i][mut]['cells']=int(val)
if (Options.verbose):
print("Cells extracted: %.2f seconds" % (time.clock() - start_time))
# for i, muts in enumerate(self.mutsAtTimepoint):
# theMuts={}
# for mut in muts:
# theMuts.update({mut: {'cells': 0.0, 'rawVAF': 0.0, 'correctedVAF': 0.0, 'reads': 0, 'depth': 0}})
# thePop = 0
# for c in self.clonesWithMutsAtTimepoint[i]:
# if mut in self.parsedData[int(c)]['Genome']:
# thePop+=int(self.parsedData[int(c)]['Pops'][i])
# theMuts[mut]['cells']=thePop
#
# cellsWithMut.update({i:theMuts})
start_time = time.clock()
for t, tmpt in enumerate(self.tmpts):
for mut in cellsWithMut[t]:
try:
cells = cellsWithMut[t][mut]['cells']
diploidPop = 2.0*self.popSize[t]
vaf = cells/diploidPop
cellsWithMut[t][mut]['rawVAF']=vaf
except ZeroDivisionError:
vaf = 0.0
cellsWithMut[t][mut]['rawVAF']=vaf
if cellsWithMut[t][mut]['rawVAF'] > 1.0:
print("DEBUG time: %s"%(t))
print("DEBUG Cells: %s"%(cells))
print("DEBUG VAF: %s"%(vaf))
print("DEBUG MUT: %s"%(mut))
print("Population: %s"%(self.popSize[t]))
print(self.popsWithMutsAtTimepoint[t][mut])
print(cellsWithMut[t][mut])
assert cellsWithMut[t][mut]['rawVAF'] <= 1.0, "VAF Greater than 1.0 error: %s"%(cellsWithMut[t][mut]['rawVAF'])
self.VAFs = cellsWithMut
if(Options.verbose):
print("Calculating VAFs: %.2f seconds"%(time.clock()-start_time))
start_time = time.clock()
self._getCorrectedVAF(Options)
if(Options.verbose):
print("Sampling and correcting VAFs: %.2f seconds"%(time.clock()-start_time))
def _getCorrectedVAF(self, Options):
'''
Gets a corrected VAF based on the rawVAF using either a poisson distribution w/
binomial or a gamma fit dist if options are selected
:param Options: Command arguments
:return: None. Sets values in a dictionary.
'''
# TODO need to add GAMMA distribution fits from a file
if Options.depthFile!=None:
pass
else:
for t in self.VAFs:
for mut in self.VAFs[t]:
vaf = self.VAFs[t][mut]['rawVAF']
try:
self.VAFs[t][mut]['reads'] = binom.rvs(Options.depth, vaf)
except ValueError: # TODO vaf shouldn't ever be more than 1.0 but it is.
self.VAFs[t][mut]['reads'] = binom.rvs(Options.depth, 1.0)
self.VAFs[t][mut]['depth'] = float(poisson.rvs(Options.depth))
# print("%s reads, %s depth"%(fi,Di))
self.VAFs[t][mut]['correctedVAF'] = self.VAFs[t][mut]['reads']/self.VAFs[t][mut]['depth']
def BuildOutputTable(Options, data, snpeff, refGenome):
geneLocs = GeneLocs(Options, snpeff)
pickle.dump(geneLocs, open('file.p', 'wb')) # TODO Delete when done with dev
geneLocs = pickle.load(open('file.p', 'rb')) # TODO Delete when done with dev
fastaRec = FastaRecords(geneLocs, Options, snpeff, refGenome)
pickle.dump(fastaRec, open('file2.p', 'wb')) # TODO Delete when done with dev
fastaRec = pickle.load(open('file2.p', 'rb')) # TODO Delete when done with dev
revDict = {'t': 'a', 'a': 't', 'g': 'c', 'c': 'g'}
outline = []
for i, sim in enumerate(data):
if Options.tmpts==None: # Means we are getting every tmpt
for k, t in enumerate(sim.tmpts):
for v, mut in enumerate(sim.mutsAtTimepoint[k]):
# Sample Tmpt Chrom Pos Ref Alt Gene Induction_t Trinuc TrinucTrue RawVAF CorrVAF Reads Depth
mutInfo = mut.split('.')
ref = mutInfo[3].split('[')[1].split('>')[0]
alt = mutInfo[3].split('[')[1].split('>')[1].split(']')[0]
vafDat = sim.VAFs[k][mut]
val = [os.path.basename(sim.data).replace('.csv',''), t, mutInfo[4], mutInfo[5], ref, alt, mutInfo[2], mutInfo[0], mutInfo[3][0] + alt + mutInfo[3][6], mutInfo[3][0] + alt + mutInfo[3][6], vafDat['rawVAF'], vafDat['correctedVAF'], vafDat['reads'], vafDat['depth']]
outline.append(val)
else: # TODO build so that you get only specificed tmpts
pass
outFinal = []
for mutOut in outline:
for rec in fastaRec.geneSeq:
if rec[3]==mutOut[6]:
start = int(rec[1])
end = int(rec[2])
if mutOut[4].lower()==rec[0][int(mutOut[3])-start-1]:
outFinal.append(mutOut)
else:
mutOut[4]=rec[0][int(mutOut[3])-start-1].upper()
mutOut[5]=revDict[mutOut[5].lower()].upper()
mutOut[9]=revDict[mutOut[9][0].lower()].upper()+revDict[mutOut[9][1].lower()].upper()+revDict[mutOut[9][2].lower()].upper()
outFinal.append(mutOut)
WriteVCF(Options, outFinal)
vcfData = AnnotateVCF(Options, snpeff, Options.output + "tmp.vcf")
anotherList = []
for mutOut in outFinal:
key = mutOut[2] + "\t" + mutOut[3] + "\t" + mutOut[4] + "\t" + mutOut[5]
effectField = vcfData[key].split('\t')[7].split(";")[2].split("=")[1].split("|")
effect = effectField[1]
mutOut.append(effect)
anotherList.append(mutOut)
with open(Options.output + "mutations.txt", "w") as outfile:
outfile.write("Sample\tTmpt\tChrom\tPos\tRef\tAlt\tGene\tInduction_t\tTrinuc\tTrinucTrue\tRawVAF\tCorrVAF\tReads\tDepth\tEff\n")
for item in anotherList:
item = [str(i) for i in item]
outfile.write("\t".join(item) + "\n")
def BuildOutputCloneData(Options, data):
outline = []
for i, sim in enumerate(data):
for clone in sim.parsedData:
line = [os.path.basename(sim.data).replace('.csv',''), sim.parsedData[clone]['CloneID'], sim.parsedData[clone]['ParentID'], len(sim.parsedData[clone]['Genome'].split(';')), '\t'.join(sim.parsedData[clone]['Pops'])]
outline.append('\t'.join([str(item) for item in line]))
with open(Options.output + "mutations.clonal.txt", "w") as outCounts:
outCounts.write('\t'.join(["Sample","CloneID", "ParentID", "NumMuts",'\t'.join([str(i) for i, v in enumerate('\t'.join(sim.parsedData[0]['Pops']))])]) + "\n")
outCounts.write("\n".join(outline))
def AnnotateVCF(Options, snpeff, inFile):
print("Annotating mutations...")
cmd = ["java", "-jar", snpeff['snpeff'], "ann", "-canon", Options.genome, inFile]
locInfo = os.popen(' '.join(cmd)).read()
with open(inFile.replace('.vcf','.ann.vcf'), 'w') as outfile:
outfile.write(locInfo)
print("Annotation Complete...")
os.remove(inFile)
out = {}
for line in locInfo.split('\n'):
if line.startswith("#")==False and line!='':
k = line.split('\t')
key = k[0] + "\t" + k[1] + "\t" + k[3] + "\t" + k[4]
out.update({ key : line})
return(out)
def WriteVCF(Options, mutOut):
header = "##fileformat=VCFv4.2\n##fileDate=%s\n##source=GattacaExample.Gattaca\n##reference=%s\n##contig=None\n##phasing=partial"%(time.asctime( time.localtime(time.time()) ),Options.genome)
header2 = "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">"
header3 = "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n##FORMAT=<ID=VR,Number=1,Type=Integer,Description=\"Variant Read Depth\">"
header4 = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tALLSAMPLES"
seen = []
with open(Options.output + "tmp.vcf", 'w') as outFile:
outFile.write('\n'.join([header,header2,header3,header4]) + "\n")
for m in mutOut:
out = [m[2],m[3],".",m[4],m[5],".","PASS"]
if out not in seen:
infoField = "DP=%s;AF=%s\tGT:DP:VR\t0/1:%s:%s"%(int(m[13]),round(m[11],5),int(m[13]),m[12])
outFile.write('\t'.join(out) + "\t" + infoField + "\n")
seen.append(out)
if __name__=="__main__":
# Pull information about usr system files
localpath = os.path.abspath(__file__).replace('GattacaFinish.py', '') # path to scripts working directory
Config = configparser.ConfigParser()
Options = Parser()
Config.read(localpath + "usr_paths.ini")
snpeff = ConfigSectionMap(Config.sections()[0], Config) # get annovar script paths
refGenome = ConfigSectionMap(Config.sections()[1], Config) # get reference genome
print("Processing GattacaExample.Gattaca Outputs.")
if os.path.isdir(os.path.expanduser(Options.output))==False:
os.mkdir(os.path.expanduser(Options.output))
data = []
if os.path.isfile(os.path.expanduser(Options.inFile)):
data = Data(Options)
elif os.path.isdir(os.path.expanduser(Options.inFile)):
allFiles = glob.glob(Options.inFile + "*.csv")
Options2 = Options
for i in allFiles:
Options2.inFile = i
data.append(Data(Options2))
else:
sys.exit("No input file(s) found.")
#
#
# # # TODO build endpoint table or table with all mutations.
# pickle.dump(data, open('postProcess.p', 'wb')) # TODO Delete when done with dev
# data = pickle.load(open('postProcess.p', 'rb')) # TODO Delete when done with dev
BuildOutputCloneData(Options, data)
BuildOutputTable(Options, data, snpeff, refGenome)
if Options.plots:
build = Plots(Options,data)
# build.mutContextMovie(Options, data)
# build.mutVAF(Options, data)
build.oneOverF(Options, data)