Skip to content

Commit

Permalink
2.9.4 UB tag write speed
Browse files Browse the repository at this point in the history
  • Loading branch information
cziegenhain committed Sep 12, 2020
1 parent 18c8933 commit 4e23bf9
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 54 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Speed writing of error-corrected UMI tags to bam file up significantly. Prevent potential crash when no cells meet any user-defined downsampling criteria.

19 July 2020: [zUMIs2.9.3](https://github.com/sdparekh/zUMIs/releases/tag/2.9.3): Add zUMIs version number to header of unmapped bam files. Several bug fixes: prevent error during mapping with memory handling; incorrect Smart-seq3 UMI-fragment counting.

14 July 2020: [zUMIs2.9.2](https://github.com/sdparekh/zUMIs/releases/tag/2.9.2): Several bug fixes: Prevent RAM from ballooning, issues with resuming from different stage. Speed up demultiplexing further by chrosome-wise operations. Remove need for second bam file sorting after hamming collapse by keeping sort order.
Expand Down
84 changes: 50 additions & 34 deletions UMIstuffFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,21 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
if(inex){
taglist <- c(taglist, "GI")
}

rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
if(opt$read_layout == "PE"){
parms <- ScanBamParam(tag=taglist,
what="pos",
parms <- ScanBamParam(tag=taglist,
what="pos",
flag = scanBamFlag(isFirstMateRead = TRUE),
tagFilter = list(BC = chunk_bcs),
which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
}else{
parms <- ScanBamParam(tag=taglist,
what="pos",
parms <- ScanBamParam(tag=taglist,
what="pos",
tagFilter = list(BC = chunk_bcs),
which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
}

dat <- scanBam(file = featfile, param = parms)
if(inex){
dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
Expand All @@ -95,14 +95,14 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
rsamtools_reads[ , ftype :="NA"][
is.na(GEin)==F, ftype :="intron"][
is.na(GE)==F , ftype:="exon"][
is.na(GE) , GE:=GEin][
is.na(GE) , GE:=GEin][
,GEin:=NULL ]
}else{
rsamtools_reads[, ftype :="NA"][
is.na(GE)==F, ftype :="exon"]
}
setkey(rsamtools_reads,RG)

if(keepUnassigned){
return( rsamtools_reads )
}else{
Expand Down Expand Up @@ -299,7 +299,7 @@ collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
names(ll$downsampling)<-subNames
ll
}, mc.cores = length(mapList))

}


Expand Down Expand Up @@ -334,6 +334,22 @@ write_molecule_mapping <- function(mm){
}
}

correct_UB_tags_new <- function(inbamfile,n){
mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/",n,".")
outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
bcpath <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes_binned.txt")
use_threads <- opt$num_threads
pypath <- paste0(opt$zUMIs_directory,"/correct_UBtag.py")
UBcmd <- paste("python3", pypath,
"--bam",inbamfile,
"--out",outbamfile,
"--p",use_threads,
"--bcs",bcpath,
"--stub",mm_path)
system(UBcmd)
return(outbamfile)
}

correct_UB_tags <- function(bccount, samtoolsexc){
mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
demux_path <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/")
Expand Down Expand Up @@ -365,46 +381,46 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
if(!dir.exists( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )){
dir.create( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )
}

installed_py <- try(system("pip freeze", intern = TRUE, ignore.stderr = TRUE), silent = TRUE)
suppressWarnings(if(grepl('Error', installed_py)){
installed_py <- try(system("pip3 freeze", intern = TRUE, ignore.stderr = TRUE), silent = TRUE)
})

if(any(grepl("pysam==",installed_py))){
print("Using python implementation to demultiplex.")
print(Sys.time())
max_filehandles <- as.numeric(system("ulimit -n", intern = TRUE))
threads_perBC <- floor(max_filehandles/nBCs)

if(max_filehandles < nBCs | nBCs > 10000){
#print("Warning! You cannot open enough filehandles for demultiplexing! Increase ulimit -n")
#break up in several demultiplexing runs to avoid choke
nchunks <- ifelse(max_filehandles < nBCs, no = ceiling(nBCs/10000), yes = ceiling(nBCs/(max_filehandles-100)))
if(nchunks == 1){nchunks = 2}
print(paste("Breaking up demultiplexing in",nchunks,"chunks. This may be because you have >10000 cells or a too low filehandle limit (ulimit -n)."))

full_bclist <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt")
bcsplit_prefix <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,"kept_barcodes.")

split_cmd <- paste0("split -a 3 -n l/",nchunks," ",full_bclist, " ", bcsplit_prefix)
system(split_cmd)
bclist <- list.files(path = paste0(opt$out_dir,"/zUMIs_output/"), pattern = paste0(".",opt$project,"kept_barcodes."),all.files = TRUE, full.names = TRUE)
header_cmd <- paste('sed -i -e \'1s/^/XC,n,cellindex\\n/\'', bclist[-1], collapse = '; ', sep = ' ')
system(header_cmd)

if(max_filehandles < nBCs){threads_perBC <- 1}
}else{
bclist <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt")
}

if(threads_perBC > 2){
threads_perBC <- 2
}
threads_decompress <- opt$num_threads - threads_perBC
py_script <- paste0(opt$zUMIs_directory,"/misc/demultiplex_BC.py")
print("Demultiplexing zUMIs bam file...")

if(threads_decompress > 10 & nBCs < 2500){ #if capacity is there, do demultiplexing parallelised per chromosome
collect_demultiplex = TRUE #set a flag to remember to collect the output chunks later
demux_cmd <- "sleep 1" #set a decoy system command
Expand All @@ -429,7 +445,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
, collapse = "; ")
system(pysam_cmd)
}, mc.cores = threads_chromosomes)

}else{
collect_demultiplex = FALSE
outstub <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/",opt$project,".")
Expand All @@ -443,7 +459,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
"--chr", "allreads"
, collapse = "; ")
}

}else{
print("Using perl implementation to demultiplex.")
demux_cmd <- paste0(opt$zUMIs_directory,"/misc/demultiplex_BC.pl ",opt$out_dir," ",opt$project, " ", bamfile, " ", samtoolsexc )
Expand Down Expand Up @@ -576,46 +592,46 @@ RPKM.calc <- function(exprmat, gene.length){

.intronProbability<-function(featfile,bccount,inex,cores,samtoolsexc,saf, allC){
print("Fetching reads from bam files again to calculate intron scores...")

nchunks <- length(unique(bccount$chunkID))
all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")

for(i in unique(bccount$chunkID)){
rgfile <- all_rgfiles[i]
chunks <- bccount[chunkID==i]$XC
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
}

headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
write(headerXX,"freadHeader")

headercommand <- "cat freadHeader > "
layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x UB -x EN -x IN -x GE -x GI", layoutflag, "-@")

outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
system(paste(headercommand,outfiles,collapse = "; "))

cpusperchunk <- round(cores/nchunks,0)

grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/ES:Z://g' | sed 's/IS:Z://g' | grep -F -f "
inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

system(paste(inex_cmd,"wait"))
system("rm freadHeader")
system(paste("rm",all_rgfiles))

#continue with reading the data in
for(i in unique(bccount$chunkID)){
print(paste("Working on barcode chunk", i, "out of", length(unique(bccount$chunkID))))
print(paste("Processing", length(bccount[chunkID == i]$XC), "barcodes in this chunk..."))
samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",i,".txt")

reads<-data.table::fread(samfile, na.strings=c(""),
select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","ES","IS") )
reads <- reads[ES == "Unassigned_NoFeatures" & IS == "Unassigned_NoFeatures"]
system(paste("rm",samfile))
system(paste("rm",samfile))

scores_out<-.calculateProbabilityScores(reads = reads,
saf = saf,
bccount = bccount[chunkID == i],
Expand All @@ -642,8 +658,8 @@ RPKM.calc <- function(exprmat, gene.length){

#get intronic bp per gene
tmp<-merge(x=tmp,y=saf$intronsPerGene, by.x="GE", by.y="GeneID")


dt<-merge(x=dt,y=tmp, by="RG", all.x=T)


Expand All @@ -660,4 +676,4 @@ check_read_layout <- function(bamfile){
isbamPE <- suppressMessages(Rsamtools::testPairedEndBam(bamfile))
found_read_layout <- ifelse(isbamPE == TRUE, "PE", "SE")
return(found_read_layout)
}
}
20 changes: 12 additions & 8 deletions barcodeIDFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
}
colnames(subsample.splits)<-c("minR","maxR")

if(nrow(subsample.splits) == 0){
subsample.splits <- setDownSamplingOption(down = "0", bccount = bccount, filename = filename)
}

return( subsample.splits )
}

Expand Down Expand Up @@ -236,7 +240,7 @@ BCbin <- function(bccount_file, bc_detected) {
order(-n)][
!( XC %in% true_BCs ) ]
nocell_BCs <- nocell_bccount[,XC]

if(opt$barcodes$BarcodeBinning>0){
#break up in pieces of 1000 real BCs in case the hamming distance calculation gets too large!
true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/1000))
Expand Down Expand Up @@ -266,24 +270,24 @@ BCbin <- function(bccount_file, bc_detected) {
}else{
binmap <- data.table()
}

if(!is.null(opt$barcodes$barcode_sharing)){
share_table <- data.table::fread( opt$barcodes$barcode_sharing, header = F, skip = 1)
if(ncol(share_table) > 2){ #flatten table more if necessary
share_table <- data.table::melt(share_table, id.vars = "V1")[,variable := NULL]
}
setnames(share_table, c("main_bc","shared_bc"))

share_mode <- data.table::fread( opt$barcodes$barcode_sharing, header = F, nrows = 1)$V1
share_mode <- as.numeric(unlist(strsplit(gsub(pattern = "#",replacement = "", x = share_mode),"-")))

if(nrow(binmap)>0){ #first fix the noisy BC assignments so that they dont go into share barcodes
binmap[, partial_bc := substr(trueBC,start = share_mode[1], stop = share_mode[2])]
binmap[, partial_bc := substr(trueBC,start = share_mode[1], stop = share_mode[2])]
binmap <- merge(binmap, share_table, all.x = TRUE, by.x = "partial_bc", by.y = "shared_bc")
substr(binmap[!is.na(main_bc)]$trueBC,start = share_mode[1], stop = share_mode[2]) <- binmap[!is.na(main_bc)]$main_bc #replace to main barcode string
binmap[,c("partial_bc", "main_bc") := NULL]
}

#now check for merging detected barcodes
bc_detected[, partial_bc := substr(XC,start = share_mode[1], stop = share_mode[2])]
bc_detected <- merge(bc_detected, share_table, all.x = TRUE, by.x = "partial_bc", by.y = "shared_bc")
Expand All @@ -293,12 +297,12 @@ BCbin <- function(bccount_file, bc_detected) {
setnames(share_map,"XC","falseBC")
share_map[,c("partial_bc","main_bc","cellindex") := NULL][,hamming := 0]
share_map <- share_map[,c("falseBC","hamming","trueBC","n"), with = FALSE]

#messy move the shorter true bc list to the main R script
bc_detected <- bc_detected[is.na(main_bc)]
bc_detected[,c("partial_bc","main_bc") := NULL]
bccount <<- bc_detected

if(nrow(binmap)>0){
binmap <- rbind(binmap, share_map)
}else{
Expand Down
Loading

0 comments on commit 4e23bf9

Please sign in to comment.