-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvalidation-merge.py
146 lines (129 loc) · 5.67 KB
/
validation-merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os, os.path, sys
import csv, gzip
import itertools
from mergesort import mergesort
class SeqDict(object):
def __init__(self, f):
self.f = f
with open(f, 'r') as fi:
_struct = []
for line in fi.readlines():
_struct.append([c.strip() for c in line.split('\t')])
self._struct = _struct
def contigs(self):
result = []
for s in self._struct:
result.append(s[1].split(':',2)[1])
return result
def cmprecords(a, b, seqorder):
achrorder = seqorder.index(a['Chromosome'])
bchrorder = seqorder.index(b['Chromosome'])
if achrorder != bchrorder:
return achrorder - bchrorder
else:
return int(a['Start_Position']) - int(b['Start_Position'])
def batch(merges, seqorder):
def mafkeyfun(myobj):
class K(object):
def __init__(self, obj, *args):
self.obj = obj
def __cmp__(self, other):
return cmprecords(self.obj, other.obj, seqorder)
b = []
for record in mergesort.mergepeek(merges, mafkeyfun):
if b and cmprecords(b[0], record, seqorder) != 0:
yield b
b = [record] # reset
else:
b.append(record)
# find the lowest record in the merges
yield b
yield None # a sentinal value, sends None but does not stop iteration (that happens on the next call)
def record_update(record, r):
for key in ('Verification_Status', 'Validation_Status', 'Mutation_Status'):
record_val = record.get(key, None)
if record_val is None or record_val == '.':
record[key] = r.get(key, '.')
elif record_val in ('Unknown', 'Untested', 'Somatic'):
r_val = r.get(key, None)
if r_val is not None:
record[key] = r_val
# updates in place, this returns None
def main(args):
seqdict = SeqDict(args.SEQDICT)
contigs = seqdict.contigs()
primary = csv.DictReader(itertools.ifilter(lambda x: not x.startswith('#'), args.PRIMARY), delimiter = '\t')
merges = [itertools.ifilter(lambda x: x['Chromosome'] in contigs, csv.DictReader(itertools.ifilter(lambda x: not x.startswith('#'), i), delimiter = '\t')) for i in args.MERGES]
# assume that merges and primary are sorted
writer = csv.DictWriter(args.OUTPUT, delimiter = '\t', fieldnames = primary.fieldnames if 'MERGESOURCE' in primary.fieldnames else primary.fieldnames + ['MERGESOURCE'], extrasaction = 'ignore', restval = '.')
primary = itertools.ifilter(lambda x: x['Chromosome'] in contigs, primary)
writer.writeheader()
batchiter = batch(merges, contigs)
nextbatch = batchiter.next()
print "Batching %(Chromosome)s|%(Start_Position)s|%(Hugo_Symbol)s" % nextbatch[0]
record = primary.next()
while record and nextbatch:
cmp = cmprecords(record, nextbatch[0], contigs)
if cmp < 0:
record['MERGESOURCE'] = 'PRIMARY'
writer.writerow(record)
try:
record = primary.next()
except StopIteration:
record = None
elif cmp > 0:
# we have passed the batch so write out the remaining items
for r in nextbatch:
r['MERGESOURCE'] = 'PASTE'
writer.writerow(r)
nextbatch = batchiter.next() # move up the batch records
if nextbatch:
print "Batching %(Chromosome)s|%(Start_Position)s|%(Hugo_Symbol)s" % nextbatch[0]
else:
print "Done with batches!"
else:
# we hit paydirt :)
newnextbatch = []
for r in nextbatch:
for matchstr in ('End_Position', 'Tumor_Sample_Barcode', 'Tumor_Seq_Allele2', 'Reference_Allele'):
if r.get(matchstr, None) != record.get(matchstr, None):
newnextbatch.append(r)
break # breaks the for
else:
# case where all match
# this is a match and so we should update the record with the info in r
record_update(record, r) # this imprints the value and we can drop the r from the nextbatch entries
record['MERGESOURCE'] = 'MERGE'
writer.writerow(record)
try:
record = primary.next()
except StopIteration:
record = None
if newnextbatch:
nextbatch = newnextbatch
else:
nextbatch = batchiter.next() # we used up all of the records at this position
print "Batching %(Chromosome)s|%(Start_Position)s|%(Hugo_Symbol)s" % nextbatch[0]
if record is not None:
record['MERGESOURCE'] = 'PRIMARY'
writer.writerow(record)
for record in primary: # lets keep reading
record['MERGESOURCE'] = 'PRIMARY'
writer.writerow(record)
if nextbatch is not None:
for r in nextbatch:
r['MERGESOURCE'] = 'PASTE'
writer.writerow(r)
for nextbatch in batchiter:
for r in nextbatch:
r['MERGESOURCE'] = 'PASTE'
writer.writerow(r)
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('PRIMARY', type = argparse.FileType('r'), help = 'input file or stream')
parser.add_argument('SEQDICT', type = str, help = 'input sequence dict as generated by Picard')
parser.add_argument('OUTPUT', type = argparse.FileType('w'), help = 'output file')
parser.add_argument('MERGES', type = argparse.FileType('r'), nargs='+', help = 'input file or streams to merge')
args = parser.parse_args()
main(args)