-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathinsert-ibd-segments.py
253 lines (231 loc) · 10.5 KB
/
insert-ibd-segments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# Written 2013 by Peter Ralph and Graham Coop
#
# contact: [email protected]
#
# To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.
#
# You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
#
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
description = '''Merge two sets of genomes, randomly creating ibd segments within one set.
Output only on lines shared by both sets.
'''
import gzip
import random
import sys
from optparse import OptionParser
from math import isnan, exp, log
import pdb
def fileopt(fname,opts):
'''Return the file referred to by fname, open with options opts;
if fname is "-" return stdin/stdout; if fname ends with .gz run it through gzip.
'''
if fname == "-":
if opts == "r":
fobj = sys.stdin
elif opts == "w":
fobj = sys.stdout
else:
print "Something not right here."
elif fname[len(fname)-3:len(fname)]==".gz":
fobj = gzip.open(fname,opts)
else:
fobj = open(fname,opts)
return fobj
def parsesite(snpsite):
'''Read in a line from a snp file;
keeping the two haplotypes of an individual together.
'''
snpsite = snpsite.split()
code = snpsite[0]
name = snpsite[1]
snpsite = [ snpsite[2*k:2*k+2] for k in range(1,len(snpsite)/2) ]
return code,name,snpsite
def pairfiles(sf,tf,mapd):
'''Read in sites from two files until their next matching site.
Note that mapd enties should be in the form:
mapd[ name ] = float(mappos), chrom, int(bp)
'''
sline = sf.readline()
tline = tf.readline()
while sline != '' and tline != '':
scode,sname,ssite = parsesite(sline)
tcode,tname,tsite = parsesite(tline)
smap = mapd[sname][0]
tmap = mapd[tname][0]
sbp = mapd[sname][2]
tbp = mapd[tname][2]
if isnan( sbp ) or isnan( smap ):
# print "skipping ", sname
sline = sf.readline()
elif isnan( tbp ) or isnan( tmap ):
# print "skipping ", tname
tline = tf.readline()
elif sbp < tbp:
# print "skipping ", sname, sbp, tbp
sline = sf.readline()
elif tbp < sbp:
# print "skipping ", tname, sbp, tbp
tline = tf.readline()
else:
# print "reading ", sname, tname, sbp, tbp
yield scode, sname, ssite, tsite, mapd[sname]
sline = sf.readline()
tline = tf.readline()
parser = OptionParser(description=description)
parser.add_option("-i","--ibdfile",dest="ibdfile",help="name of input file to copy segments within (or '-' for stdin)",default="-")
parser.add_option("-j","--otherfile",dest="otherfile",help="name of other input file to merge with (or '-' for stdin)")
parser.add_option("-o","--outfile",dest="outfile",help="name of output file (or '-' for stdout)",default="-")
parser.add_option("-m","--mapfile",dest="mapfile",help="name of map file (or '-' for stdin)")
parser.add_option("-g","--logfile",dest="logfile",help="name of log file (or '-' for stdout)",default="insert-ibd-segments.log")
parser.add_option("-a","--minwait",dest="minwait",help="Minimum length to wait before copying another segment (default 2cM)", default=2.0)
parser.add_option("-b","--maxwait",dest="maxwait",help="Maximum length to wait before copying another segment (default 3cM)", default=3.0)
parser.add_option("-x","--minlen",dest="minlen",help="Minimum length of segment (default 0.5cM)", default=0.5)
parser.add_option("-y","--maxlen",dest="maxlen",help="Maximum length of segment (default 10cM)", default=10.0)
parser.add_option("-u","--mutprob",dest="mutprob",help="Mutation probability per site (default=.002)", default=0.002)
parser.add_option("-s","--missprob",dest="missprob",help="Missing probability per site (default=.023)", default=0.023)
# parser.add_option("-v","--mutable",dest="mutable",help="Proportion of individuals to apply 5x higher mutation rates to.", default=.1)
(options,args) = parser.parse_args()
ibdfile = fileopt(options.ibdfile, "r")
otherfile = fileopt(options.otherfile, "r")
outfile = fileopt(options.outfile, "w")
mapfile = fileopt(options.mapfile, "r")
logfile = fileopt(options.logfile, "w")
minwait = float(options.minwait)
maxwait = float(options.maxwait)
minlen = float(options.minlen)
maxlen = float(options.maxlen)
mutprob = float(options.mutprob)
missprob = float(options.missprob)
# mutable = float(options.mutable)
def random_seglen(maxlen=maxlen,minlen=minlen):
'''random segment length to be copied -- uniform on a log scale'''
return exp( log(maxlen/minlen)*random.uniform(0,1)+log(minlen) )
# Missing code
missing_code = '0'
# # TEST
# ibdfile = fileopt("short1_chr22.beagle", "r")
# otherfile = fileopt("short2_chr22.beagle", "r")
# mapfile = fileopt("/home/peter/projects/ibd/data/genetic_maps/marker.genetic22.gmap", "r")
# outfile = fileopt("INSERTED-test-output", "w")
# logfile = fileopt("-","w")
# minwait = 1.0
# maxwait = 5.0
# Read in mapfile
mapheader = mapfile.readline().split()
mapd = {}
for line in mapfile:
# of the form CHROM NAME MAPPOS BPPOS
chrom,name,mappos,bp = line.split()
if mappos == "NA":
mappos = "NaN" # translate to python
mapd[ name ] = float(mappos), chrom, int(bp)
# set things up
ibdheader = ibdfile.readline().split()
otherheader = otherfile.readline().split()
# record the ids of the individuals
ibdids = [ x for i,x in enumerate(ibdheader[2:]) if i%2 ]
otherids = [ x for i,x in enumerate(otherheader[2:]) if i%2 ]
outheader = ibdheader + otherheader[2:]
outfile.write(" ".join(outheader)+"\n")
# Number of samples
nibd = len(ibdheader)/2-1
nother = len(otherheader)/2-1
nsamps = nibd + nother
logfile.write("# Working on " + str(nsamps) + " samples.\n")
logfile.write("id1 id2 start end\n")
# create the iterator
stfiles = pairfiles(ibdfile, otherfile, mapd)
# Read until first matching site
code, name, ssite, tsite, mapsite = stfiles.next()
# Maintain lists of when each individual is next allowed to copy
# and initialize them offset from the first position
swait = [ mapsite[0]+random.uniform(0,1) for x in ssite ]
# Also maintain the list of who is copying from where:
# ...this is either an integer in range(nibd) or -1, meaning "not copying"
# ... and a 0 or 1 for which haplotype
order = [ (-1,-1,-1) for x in range(nibd) ]
# and the list of how long to copy for
copyuntil = [ 0 for x in swait ]
# and keep list of segments in a dict
segments = {}
# ... and the set of individuals last copied to
lastcopied = [ [-1,-1] for x in range(nibd) ]
# output first line
outfile.write(" ".join( [code,name] + [" ".join(x) for x in ssite+tsite] )+"\n")
# for error checking
allnums = []
for code, name, ssite, tsite, mapsite in stfiles:
# print str(mapsite[0]) + ":" + str(swait)
# terminate segments
for j in range(nibd):
if copyuntil[j] <= mapsite[0]:
i = order[j][0]
if (i,j) in segments:
seg = segments.pop( (i,j) )
# record site we began on and last site we wrote to file as start and end of segment
logfile.write(" ".join( [ ibdids[i], ibdids[j], str(seg[0]), str(mapd[lastsite][0]) ] ) + "\n" )
order[j] = -1,-1,-1
# choose new sources and targets
ibdable = [ j for j,x in enumerate(swait) if x <= mapsite[0] ]
ibdable = random.sample( ibdable, len(ibdable)-len(ibdable)%2 )
sourceable = ibdable[:len(ibdable)/2]
targetable = ibdable[len(ibdable)/2:2*(len(ibdable)/2)]
newseglens = [ random_seglen() for i in sourceable ]
# print str(zip( sourceable, targetable, newseglens ))
for i,j,x in zip( sourceable, targetable, newseglens ):
# don't copy if we've recently copied these individuals
if not i in lastcopied[j] and not j in lastcopied[i]:
# copy from i to j for x length on randomly chosen haplotype
order[j] = [i] + random.sample([0,1],2)
lastcopied[i].append(j); lastcopied[i].pop(0)
lastcopied[j].append(i); lastcopied[j].pop(0)
copyuntil[j] = x+mapsite[0]
swait[i] = swait[j] = copyuntil[j] + random.uniform(minwait,maxwait)
# record chosen segments
segments[i,j] = mapsite[0],copyuntil[j]
# logfile.write(" ".join( [ ibdids[i], ibdids[j], str(mapsite[0]), str(copyuntil[j]) ] ) + "\n" )
# find set of alleles
alleles = set( sum(ssite+tsite,[]) )
alleles.discard(missing_code)
# record numbers of alleles (for error checking)
if alleles:
allnums.append(len(alleles))
# don't output if it's monomorphic
if alleles and len(alleles)>1:
# copy over segments
ssite = [ ( [x,y] if (i == -1) else ( [x,ssite[i][u]] if v else [ssite[i][u],y] ) ) for (i,u,v),(x,y) in zip(order, ssite) ]
# for visualization debugging:
# ssite = [ ( [x,y] if (i == -1) else ( [x,'*'] if v else ['*',y] ) ) for (i,u,v),(x,y) in zip(order, ssite) ]
# insert mutations
mutate = [ random.uniform(0,1) < mutprob for k in ssite ]
ssite = [ ( [x,y] if not mut else ( [alleles.difference([x]).pop(),y] if random.choice([0,1]) else [x,alleles.difference([y]).pop()] ) ) for (x,y),mut in zip(ssite,mutate) ]
# insert missings
missing = [ random.uniform(0,1) < missprob for k in ssite ]
ssite = [ ( [x,y] if not miss else [missing_code,missing_code] ) for (x,y),miss in zip(ssite, missing) ]
# output
# print "--", code, name, " mutated: ", sum(mutate), " missing: ", sum(missing), ssite, tsite
outfile.write(" ".join( [code,name] + [" ".join(x) for x in ssite+tsite] )+"\n")
lastsite = name
# else:
# print "Skipping monomorphic site ", mapsite[0]
# output remaining segments falling off the ends
for (i,j) in segments:
seg = segments[ (i,j) ]
logfile.write(" ".join( [ ibdids[i], ibdids[j], str(seg[0]), str(mapd[lastsite][0]) ] ) + "\n" )
for j in range(nibd):
if copyuntil[j] <= mapsite[0]:
i = order[j][0]
if (i,j) in segments:
seg = segments[ i,j ]
# record site we began on and last site we wrote to file as start and end of segment
order[j] = -1,-1,-1
if any( [ x>2 for x in allnums ] ):
alltab = [ allnums.count(x) for x in set(allnums) ]
print "WARNING:"
print "More than three alleles at some sites:", alltab
ibdfile.close()
otherfile.close()
outfile.close()
logfile.close()