-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathparse-sims-fns.R
270 lines (247 loc) · 13.7 KB
/
parse-sims-fns.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# Written 2013 by Peter Ralph and Graham Coop
#
# contact: [email protected]
#
# To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty.
#
# You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
#
#
coalprob <- function ( gens, opts, nesize=opts$nesize, migprob=opts$migprob, ploidy=2 ) {
# compute coalescent probs from nesize and migprob, recursively
# note that gens is in *generations* = 2*meioses
#
if (is.null(dim(migprob(1)))) {
migprobfn <- function (t) { x <- migprob(t); dim(x) <- c(1,length(x)); return(x) }
} else {
migprobfn <- migprob
}
if (is.null(colnames(migprobfn(1)))) {
states <- 1:ncol(migprobfn(1))
} else {
states <- colnames(migprobfn(1))
}
coal <- array(0,dim=c(max(gens),length(states),length(states)))
dimnames(coal) <- list( NULL, states, states )
# this indexes where coalescences can occur
samestates <- c( as.vector( diag(length(states))>0 ), FALSE )
# transition matrix for pairs: last state is the graveyard (coalescence)
transprobs <- cbind( rbind( diag(length(states)) %x% diag(length(states)), 0 ), 0 )
for (t in 1:max(gens)) {
if (!missing(opts) & t>opts$ngens) {
# only ran the simulation so long
coal[t,,] <- 0
} else {
# nesize works in *diploid* individuals, so actual coalescence is smaller by a factor of 1/ploidy
C <- as.vector(1/(ploidy*nesize(t)))
M <- cbind( rbind( migprobfn(t) %x% migprobfn(t), 0 ), 0 )
M[ samestates, ] <- (1-C)*M[ samestates, ]
M[ samestates, ncol(M) ] <- C
transprobs <- transprobs %*% M
coal[t,,] <- transprobs[-nrow(M),ncol(M)]
}
}
return(coal[match(gens,1:max(gens)),,,drop=FALSE])
}
mergerate <- function ( x, L, opts, minminlen=0.1, gaplen=5 ) {
# approx for the proportion of falsely merged blocks
thesegens <- 1:(max(attr(L,"gens"))/2)
coalgens <- coalprob(thesegens,opts)
f <- function (t,x) { ( gaplen*t/100*(1+x*t/100) - 2*(1+x*t/100+(1/2)*(x*t/100)^2) )*exp(-x*t/100) }
sum( sqrt(2)*sum(.chrlens-x)*(2*thesegens/100)*( f(2*thesegens,x)-f(2*thesegens,2*minminlen) ) * coalgens )
}
predict.blocks <- function ( L, opts, times=FALSE ) {
# L gives number of blocks per constant rate of coalescence in windows of numbers of *meioses*;
# coalprob works in 2*meioses (generations) here; so half of these are zero;
# here we put in those zeros.
coalgens <- coalprob(1:(max(attr(L,"gens"))/2),opts)
coal <- numeric( 2*prod(dim(coalgens)) )
coal[ 2*(1:prod(dim(coalgens))) ] <- coalgens
dim(coal) <- c( 2*dim(coalgens)[1], dim(coalgens)[-1] )
coal <- apply( coal, c(2,3), function (x) diff(c(0,cumsum(x)[attr(L,"gens")]))/diff(c(0,attr(L,"gens"))) )
sampsize <- opts$sampsize
npairs <- outer(sampsize,sampsize,"*")
diag(npairs) <- choose(sampsize,2)
coal <- coal * rep(npairs, each=dim(coal)[1])
dim(coal) <- c( dim(coal)[1], length(coal)/dim(coal)[1] )
coal <- coal[,upper.tri(npairs,diag=TRUE),drop=FALSE]
if (times) {
blocklens <- L * rep(coal,each=nrow(L))
dim(blocklens) <- c(dim(L),dim(coal)[2])
dimnames(blocklens) <- list( NULL, NULL, paste( rownames(npairs)[row(npairs)], colnames(npairs)[col(npairs)], sep="-" )[upper.tri(npairs,diag=TRUE)] )
} else {
blocklens <- L%*%coal
colnames(blocklens) <- paste( rownames(npairs)[row(npairs)], colnames(npairs)[col(npairs)], sep="-" )[upper.tri(npairs,diag=TRUE)]
}
return( (blocklens) )
}
# make stuff easier functions
loadblocks <- function ( opts, ploidy=2 ) {
blocks <- read.table(opts$ibdfile,header=TRUE,sep=" ")
# blocks$id1.orig <- blocks$id1
# blocks$id2.orig <- blocks$id2
blocks$id1 <- blocks$id1 %/% ploidy
blocks$id2 <- blocks$id2 %/% ploidy
blocks$chrom <- findInterval( blocks$start, .chrstarts/100 )
blocks$mapstart <- blocks$start*100 - .chrstarts[blocks$chrom] # these sims have recorded in cumulative distance
blocks$mapend <- blocks$end*100 - .chrstarts[blocks$chrom]
blocks$maplen <- with(blocks, mapend-mapstart)
blocks$country1 <- names(opts$sampsizes)[ findInterval( blocks$id1, cumsum( c(0, unlist( opts$sampsizes ) ) ) ) ]
blocks$country2 <- names(opts$sampsizes)[ findInterval( blocks$id2, cumsum( c(0, unlist( opts$sampsizes ) ) ) ) ]
blocks$countrypair <- with( blocks, ifelse(country1<country2,paste(country1,country2,sep="-"),paste(country2,country1,sep="-")) )
return(blocks)
}
init.sinv <- function( lendist, lenbins, npairs, L, fp ) {
S <- as.vector(smooth(lendist))
S <- pmax(1e-4, S/mean(S) )
# tail of S becomes 0
est.Sigma <- function ( np, X, alpha=100 ) {
# estimate Sigma by combining the population average with the smoothed, observed data
# where Sigma is the covariance matrix of X/np
smX <- as.vector( smooth(X) )
ppp <- (alpha*np/(npairs+alpha*np)) # 2543640 = total # pairs
Sigma <- ppp*smX + (1-ppp)*sum(smX)*S/sum(S)
return( Sigma/np^2 )
}
Ksvd <- svd( 1/sqrt(as.vector(S)) * L )
linverse <- linv( X=lendist/npairs, S=as.vector(S), Sigma=est.Sigma(np=npairs,X=lendist), Ksvd=Ksvd, maxk=30, fp=fp )
return( smoothed.nonneg( lS=linverse, Ksvd=Ksvd, bestk=15, lam=1 ) )
}
plot.ans <- function (anslist,opts,thispair,L,genscale=TRUE,coalrate=TRUE,dothese,main,legend1=TRUE,legend2=FALSE, tcols, plots=c("coal","spectrum"), spectrum.xlim, spectrum.ylim, ...) {
# plot nice stuff about a named list of sinv objects
if (missing(opts) && 'opts'%in%names(anslist)) { opts <- anslist[['opts']] }
if ( missing(L) && 'L'%in%names(anslist) ) { L <- anslist[['L']] }
if ('anslist'%in%names(anslist)) { anslist <- anslist[['anslist']] }
if ( (!missing(thispair)) && (thispair %in% names(anslist))) { anslist <- anslist[[thispair]] }
if (missing(dothese)) { dothese <- names(anslist)[sapply(anslist,class)=="sinv"] }
if (missing(tcols)) { tcols <- rainbow_hcl(length(dothese)) }
if (is.null(names(dothese))) { names(dothese) <- dothese }
if (missing(main) && !missing(thispair)) {
main <- thispair
} else if (missing(main)) {
main <- ""
} else if (length(main)==1) { main <- rep(main,2) }
if (genscale) { timescale <- 1/2 } else { timescale <- 29/2 }
npairs <- outer(opts$sampsizes,opts$sampsizes,"*")
diag(npairs) <- choose(opts$sampsizes,2)
gens <- attr(L,"gens")
lenbins <- attr(L,"lenbins")
midbins <- c(lenbins,max(.chrlens))[-1]-diff(c(lenbins,max(.chrlens)))/2
binsizes <- diff(c(lenbins,max(.chrlens)))
coal <- coalprob(gens%/%2,opts)
cdn <- dimnames(coal)
dim(coal) <- c(dim(coal)[1],prod(dim(coal)[-1]))
dimnames(coal) <- list( NULL, outer(cdn[[2]], cdn[[3]], paste, sep="-") )
coal <- coal[,upper.tri(npairs,diag=TRUE),drop=FALSE]
# account for coal working in generations
coal <- coal/2
npairs <- npairs[upper.tri(npairs,diag=TRUE)]
predicted <- predict.blocks(L,opts)
if (missing(thispair)) {
coal <- rowSums(coal*npairs[col(coal)]/sum(npairs))
predicted <- rowSums(predicted*npairs[col(predicted)]/sum(npairs))
} else {
coal <- coal[,thispair]
predicted <- predicted[,thispair]
}
coalvals <- data.frame(c( list(theoretical=coal), lapply(dothese, function (x) anslist[[match(x,names(anslist))]]$par) ))
# plot coalescent rates or number of ancestors?
if (coalrate) {
coallab <- "coal rate"
} else {
coalvals <- data.frame( lapply( coalvals, coal.to.anc, gens ) )
coallab <- "# shared ancestors"
}
spectra <- data.frame( c( list("theoretical"=predicted/binsizes, "observed"=anslist[[1]]$lendist/binsizes),
lapply(dothese, function (x) with(anslist[[x]], (npairs * (L%*%par))/binsizes ) )
) )
### BEGIN PLOTTING
if ("coal" %in% plots) {
plot( gens*timescale, rowMeans(coalvals), type='n', ylab=coallab, xlab=if(genscale){"generations ago"} else {"years ago"}, main=main[1], ... )
for (k in seq_along(dothese)) {
polygon( c(gens,rev(gens))*timescale, c(coalvals[[names(dothese)[k]]],rep(0,length(gens))), col=adjustcolor(tcols[k],.4) )
}
lines( gens*timescale, coalvals$theoretical, col='green', lwd=2 ) # generations
if (is.numeric(opts$ngens)) { abline(v=opts$ngens*timescale*2, col='grey', lty=2) }
if (legend1) { legend("topright", fill=c(NA,tcols[seq_along(dothese)]), border=c("green",rep("black",length(dothese))), legend=c("theoretical",names(dothese)) ) }
}
# predicted and observed blocks
if ("spectrum" %in% plots) {
if (missing(spectrum.xlim)) { spectrum.xlim <- range(midbins[is.finite(log(midbins))]) }
if (missing(spectrum.ylim)) { spectrum.ylim <- range((spectra$theoretical/npairs)[is.finite(log(spectra$theoretical/npairs))]) }
plot( midbins, spectra$theoretical/npairs, type='l', col='green', lwd=2, log='xy', xlab="block length (cM)", ylab="density", main=main[2], xlim=spectrum.xlim, ylim=spectrum.ylim )
lines( midbins, spectra$observed/npairs )
for (k in match(dothese,names(anslist))) {
lines( midbins, spectra[[names(dothese)[k]]]/npairs, col=tcols[k] )
}
if (legend2) { legend("topright", lty=1, lwd=c(2,1,rep(1,length(anslist))), col=c("green","black",tcols[seq_along(dothese)]), legend=c("theoretical","observed",names(anslist))) }
}
return( invisible( list(gens=gens, coal=coalvals, midbins=midbins, spectra=spectra, npairs=npairs, lenbins=lenbins ) ) )
}
do.everything <- function (prefix,minblocklen,fprate=function(x)0,maxgen,genbins=NULL) {
opts <- simopts[[prefix]] # of note is nesize and ngens
blocks <- loadblocks(opts)
if (missing(minblocklen)) { minblocklen <- opts$minlen }
if (missing(maxgen)) { maxgen <- opts$ngens }
indivinfo <- data.frame( SUBJID=1:sum(opts$sampsizes)-1, COUNTRY_SELF=names(opts$sampsizes)[unlist(lapply(1:length(opts$sampsizes),function(k) rep(k,opts$sampsizes[k])))] )
nsamples <- table(indivinfo$COUNTRY_SELF)
npairs <- outer(nsamples,nsamples,"*")
diag(npairs) <- choose(nsamples,2)
names(npairs) <- paste( names(nsamples)[row(npairs)], names(nsamples)[col(npairs)], sep="-" )
npairs <- npairs[upper.tri(npairs,diag=TRUE)]
# get a good discretization
nbins <- 100
maxbinsize <- 1
lenbins <- with( subset(blocks,maplen>minblocklen), quantile( maplen, probs=seq(0,1,length.out=nbins+1) ) )
lenbins <- c( lenbins[c(diff(lenbins)<maxbinsize,FALSE)], seq( lenbins[max(which(diff(lenbins)<maxbinsize))+1], max(lenbins), maxbinsize ) )
lenbins <- lenbins[-length(lenbins)]
lenbins[1] <- minblocklen
binsizes <- diff(c(lenbins,max(.chrlens)))
midbins <- c( lenbins[-1]-diff(lenbins)/2, lenbins[length(lenbins)] )
maxbin <- 5*ceiling(max(blocks$maplen)/5)
lendist <- do.call( cbind, with( subset(blocks,maplen>minblocklen), tapply( maplen, countrypair, function (x) hist(x, breaks=c(lenbins,maxbin), plot=FALSE)$counts ) ) )
# gens <- cumsum( rep(1:36, each=10) )
gens <- 1:(2*maxgen)
L <- theoretical.operator( lenbins=lenbins, gens=gens, chrlens=.chrlens )
fp <- rep(0, length(lenbins))
# add in unaccounted-for false positives
predicted.blocklens <- predict.blocks( L, opts )
oops <- rpois( length(lendist), fprate(midbins)*predicted.blocklens )
true.lendist <- lendist
lendist <- lendist + oops
sminverse <- init.sinv(rowSums(lendist), lenbins, sum(npairs), L, fp)
ans <- sinv( rowSums(lendist), xinit=sminverse, L, fp, npairs=sum(npairs), lam=0, gamma=1e6 )
bothans <- lapply( colnames(lendist), function (x) {
noconstraints <- sinv( lendist[,x], xinit=ans, L, fp, npairs=npairs[x], gamma=10, lam=10 )
list(
pointy = noconstraints,
smooth = squoosh( lendist[,x], xinit=noconstraints, L, fp, npairs=npairs[x], gamma=100, lam=0, relthresh=2/npairs[x] ),
smoother = squoosh( lendist[,x], xinit=noconstraints, L, fp, npairs=npairs[x], gamma=100, lam=0, relthresh=4/npairs[x] ),
lower.bounds <- lapply( genbins, function (gb) squoosh( lendist[,x], npairs[x], xinit=noconstraints, L, fp, lam=0, gamma=0, del=1, delvec=( (gens>=gb[1]) & (gens<gb[2]) ) ) ),
upper.bounds <- lapply( genbins, function (gb) squoosh( lendist[,x], npairs[x], xinit=noconstraints, L, fp, lam=0, gamma=0, del=1, delvec=( (gens<gb[1]) | (gens>=gb[2]) ) ) )
) }
)
names(bothans) <- colnames(lendist)
return( list(opts=opts, anslist=bothans, L=L, fp=fp, npairs=npairs ) )
}
## automatically get stuff out of python dict text format
parsedict <- function(x) {
# Parse the text string corresponding to a python dict
x <- gsub("\\<None\\>", "NA", gsub(":", "=", strsplit( gsub( "}.*$", "", gsub( "^.*\\{", "", x ) ), "," )[[1]] ) )
x <- paste( "list(", paste( x, collapse=", " ), ")" )
x <- gsub("'([0-9.]{1,})'", "\\1", x )
return( eval(parse(text=x)) )
}
getoptions <- function (prefix, logfilename ) {
# read in command-line options from the log file
logfilenames <- list.files( ".", paste("^", prefix, ".*\\.log$",sep="") )
if (length(logfilenames)>1) {
stop("Ambiguous log file prefix.")
} else {
logfilename <- logfilenames[1]
}
opts <- parsedict( system( paste( "grep '^options'", logfilename ), intern=TRUE ) )
names(opts)[names(opts)=="sampsizes"] <- "opts.sampsizes"
opts$sampsizes <- parsedict( system( paste( "grep '^sampsizes:'", logfilename ), intern=TRUE ) )
return( opts )
}