2.9.4 UB tag write speed

sdparekh · Sep 12, 2020 · 4e23bf9 · 4e23bf9
1 parent 18c8933
commit 4e23bf9
Show file tree

Hide file tree

Showing 8 changed files with 214 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -31,6 +31,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
 zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
 
 ## Changelog
+12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Speed writing of error-corrected UMI tags to bam file up significantly. Prevent potential crash when no cells meet any user-defined downsampling criteria. 
+
 19 July 2020: [zUMIs2.9.3](https://github.com/sdparekh/zUMIs/releases/tag/2.9.3): Add zUMIs version number to header of unmapped bam files. Several bug fixes: prevent error during mapping with memory handling; incorrect Smart-seq3 UMI-fragment counting.
 
 14 July 2020: [zUMIs2.9.2](https://github.com/sdparekh/zUMIs/releases/tag/2.9.2): Several bug fixes: Prevent RAM from ballooning, issues with resuming from different stage. Speed up demultiplexing further by chrosome-wise operations. Remove need for second bam file sorting after hamming collapse by keeping sort order.

diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R
@@ -67,21 +67,21 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
   if(inex){
     taglist <- c(taglist, "GI")
   }
-  
+
   rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
     if(opt$read_layout == "PE"){
-      parms <- ScanBamParam(tag=taglist, 
-                            what="pos", 
+      parms <- ScanBamParam(tag=taglist,
+                            what="pos",
                             flag = scanBamFlag(isFirstMateRead = TRUE),
                             tagFilter = list(BC = chunk_bcs),
                             which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
     }else{
-      parms <- ScanBamParam(tag=taglist, 
-                            what="pos", 
+      parms <- ScanBamParam(tag=taglist,
+                            what="pos",
                             tagFilter = list(BC = chunk_bcs),
                             which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
     }
-    
+
     dat <- scanBam(file = featfile, param = parms)
     if(inex){
       dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
@@ -95,14 +95,14 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
     rsamtools_reads[ , ftype :="NA"][
        is.na(GEin)==F, ftype :="intron"][
        is.na(GE)==F  , ftype:="exon"][
-       is.na(GE)     , GE:=GEin][ 
+       is.na(GE)     , GE:=GEin][
                      ,GEin:=NULL ]
   }else{
     rsamtools_reads[, ftype :="NA"][
         is.na(GE)==F, ftype :="exon"]
   }
   setkey(rsamtools_reads,RG)
-  
+
   if(keepUnassigned){
     return( rsamtools_reads )
   }else{
@@ -299,7 +299,7 @@ collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
     names(ll$downsampling)<-subNames
     ll
   }, mc.cores = length(mapList))
-  
+
 }
 
 
@@ -334,6 +334,22 @@ write_molecule_mapping <- function(mm){
   }
 }
 
+correct_UB_tags_new <- function(inbamfile,n){
+  mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/",n,".")
+  outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
+  bcpath <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes_binned.txt")
+  use_threads <- opt$num_threads
+  pypath <- paste0(opt$zUMIs_directory,"/correct_UBtag.py")
+  UBcmd <- paste("python3", pypath,
+                 "--bam",inbamfile,
+                 "--out",outbamfile,
+                 "--p",use_threads,
+                 "--bcs",bcpath,
+                 "--stub",mm_path)
+  system(UBcmd)
+  return(outbamfile)
+}
+
 correct_UB_tags <- function(bccount, samtoolsexc){
   mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
   demux_path <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/")
@@ -365,46 +381,46 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
   if(!dir.exists( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )){
     dir.create( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )
   }
-  
+
   installed_py <- try(system("pip freeze", intern = TRUE, ignore.stderr = TRUE), silent = TRUE)
   suppressWarnings(if(grepl('Error', installed_py)){
     installed_py <- try(system("pip3 freeze", intern = TRUE, ignore.stderr = TRUE), silent = TRUE)
   })
-  
+
   if(any(grepl("pysam==",installed_py))){
     print("Using python implementation to demultiplex.")
     print(Sys.time())
     max_filehandles <- as.numeric(system("ulimit -n", intern = TRUE))
     threads_perBC <- floor(max_filehandles/nBCs)
-    
+
     if(max_filehandles < nBCs | nBCs > 10000){
       #print("Warning! You cannot open enough filehandles for demultiplexing! Increase ulimit -n")
       #break up in several demultiplexing runs to avoid choke
       nchunks <- ifelse(max_filehandles < nBCs, no = ceiling(nBCs/10000), yes = ceiling(nBCs/(max_filehandles-100)))
       if(nchunks == 1){nchunks = 2}
       print(paste("Breaking up demultiplexing in",nchunks,"chunks. This may be because you have >10000 cells or a too low filehandle limit (ulimit -n)."))
-      
+
       full_bclist <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt")
       bcsplit_prefix <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,"kept_barcodes.")
-      
+
       split_cmd <- paste0("split -a 3 -n l/",nchunks," ",full_bclist, " ", bcsplit_prefix)
       system(split_cmd)
       bclist <- list.files(path = paste0(opt$out_dir,"/zUMIs_output/"), pattern =  paste0(".",opt$project,"kept_barcodes."),all.files = TRUE, full.names = TRUE)
       header_cmd <- paste('sed -i -e \'1s/^/XC,n,cellindex\\n/\'', bclist[-1], collapse = '; ', sep = ' ')
       system(header_cmd)
-      
+
       if(max_filehandles < nBCs){threads_perBC <- 1}
     }else{
       bclist <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt")
     }
-    
+
     if(threads_perBC > 2){
       threads_perBC <- 2
     }
     threads_decompress <- opt$num_threads - threads_perBC
     py_script <- paste0(opt$zUMIs_directory,"/misc/demultiplex_BC.py")
     print("Demultiplexing zUMIs bam file...")
-    
+
     if(threads_decompress > 10 & nBCs < 2500){ #if capacity is there, do demultiplexing parallelised per chromosome
       collect_demultiplex = TRUE #set a flag to remember to collect the output chunks later
       demux_cmd <- "sleep 1" #set a decoy system command
@@ -429,7 +445,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
           , collapse = "; ")
         system(pysam_cmd)
       }, mc.cores = threads_chromosomes)
-      
+
     }else{
       collect_demultiplex = FALSE
       outstub <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/",opt$project,".")
@@ -443,7 +459,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs, samtoolsexc, bccount){
         "--chr", "allreads"
         , collapse = "; ")
     }
-    
+
   }else{
     print("Using perl implementation to demultiplex.")
     demux_cmd <- paste0(opt$zUMIs_directory,"/misc/demultiplex_BC.pl ",opt$out_dir," ",opt$project, " ", bamfile, " ", samtoolsexc )
@@ -576,46 +592,46 @@ RPKM.calc <- function(exprmat, gene.length){
 
 .intronProbability<-function(featfile,bccount,inex,cores,samtoolsexc,saf, allC){
   print("Fetching reads from bam files again to calculate intron scores...")
-  
+
   nchunks <- length(unique(bccount$chunkID))
   all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")
-  
+
   for(i in unique(bccount$chunkID)){
     rgfile <- all_rgfiles[i]
     chunks <- bccount[chunkID==i]$XC
     write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
   }
-  
+
   headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
   write(headerXX,"freadHeader")
-  
+
   headercommand <- "cat freadHeader > "
   layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
   samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x UB -x EN -x IN -x GE -x GI", layoutflag, "-@")
-  
+
   outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
   system(paste(headercommand,outfiles,collapse = "; "))
-  
+
   cpusperchunk <- round(cores/nchunks,0)
-  
+
   grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/ES:Z://g' | sed 's/IS:Z://g' | grep -F -f "
   inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")
-  
+
   system(paste(inex_cmd,"wait"))
   system("rm freadHeader")
   system(paste("rm",all_rgfiles))
-  
+
   #continue with reading the data in
   for(i in unique(bccount$chunkID)){
     print(paste("Working on barcode chunk", i, "out of", length(unique(bccount$chunkID))))
     print(paste("Processing", length(bccount[chunkID == i]$XC), "barcodes in this chunk..."))
     samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",i,".txt")
-    
+
     reads<-data.table::fread(samfile, na.strings=c(""),
                              select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","ES","IS") )
     reads <- reads[ES == "Unassigned_NoFeatures" & IS == "Unassigned_NoFeatures"]
-    system(paste("rm",samfile))  
-    
+    system(paste("rm",samfile))
+
     scores_out<-.calculateProbabilityScores(reads = reads,
                                             saf = saf,
                                             bccount = bccount[chunkID == i],
@@ -642,8 +658,8 @@ RPKM.calc <- function(exprmat, gene.length){
 
   #get intronic bp per gene
   tmp<-merge(x=tmp,y=saf$intronsPerGene, by.x="GE", by.y="GeneID")
-  
-  
+
+
   dt<-merge(x=dt,y=tmp, by="RG", all.x=T)
 
 
@@ -660,4 +676,4 @@ check_read_layout <- function(bamfile){
   isbamPE <- suppressMessages(Rsamtools::testPairedEndBam(bamfile))
   found_read_layout <- ifelse(isbamPE == TRUE, "PE", "SE")
   return(found_read_layout)
-}
+}
diff --git a/barcodeIDFUN.R b/barcodeIDFUN.R
@@ -33,6 +33,10 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
   }
   colnames(subsample.splits)<-c("minR","maxR")
 
+  if(nrow(subsample.splits) == 0){
+    subsample.splits <- setDownSamplingOption(down = "0", bccount = bccount, filename = filename)
+  }
+
   return( subsample.splits )
 }
 
@@ -236,7 +240,7 @@ BCbin <- function(bccount_file, bc_detected) {
                                                                             order(-n)][
                                                                             !( XC %in% true_BCs )   ]
   nocell_BCs <- nocell_bccount[,XC]
-  
+
   if(opt$barcodes$BarcodeBinning>0){
     #break up in pieces of 1000 real BCs in case the hamming distance calculation gets too large!
     true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/1000))
@@ -266,24 +270,24 @@ BCbin <- function(bccount_file, bc_detected) {
   }else{
     binmap <- data.table()
   }
-  
+
   if(!is.null(opt$barcodes$barcode_sharing)){
     share_table <- data.table::fread( opt$barcodes$barcode_sharing, header = F, skip = 1)
     if(ncol(share_table) > 2){ #flatten table more if necessary
       share_table <- data.table::melt(share_table, id.vars = "V1")[,variable := NULL]
     }
     setnames(share_table, c("main_bc","shared_bc"))
-    
+
     share_mode <- data.table::fread( opt$barcodes$barcode_sharing, header = F, nrows = 1)$V1
     share_mode <- as.numeric(unlist(strsplit(gsub(pattern = "#",replacement = "", x = share_mode),"-")))
-    
+
     if(nrow(binmap)>0){ #first fix the noisy BC assignments so that they dont go into share barcodes
-      binmap[, partial_bc := substr(trueBC,start = share_mode[1], stop = share_mode[2])] 
+      binmap[, partial_bc := substr(trueBC,start = share_mode[1], stop = share_mode[2])]
       binmap <- merge(binmap, share_table, all.x = TRUE, by.x = "partial_bc", by.y = "shared_bc")
       substr(binmap[!is.na(main_bc)]$trueBC,start = share_mode[1], stop = share_mode[2]) <- binmap[!is.na(main_bc)]$main_bc #replace to main barcode string
       binmap[,c("partial_bc", "main_bc") := NULL]
     }
-    
+
     #now check for merging detected barcodes
     bc_detected[, partial_bc := substr(XC,start = share_mode[1], stop = share_mode[2])]
     bc_detected <- merge(bc_detected, share_table, all.x = TRUE, by.x = "partial_bc", by.y = "shared_bc")
@@ -293,12 +297,12 @@ BCbin <- function(bccount_file, bc_detected) {
     setnames(share_map,"XC","falseBC")
     share_map[,c("partial_bc","main_bc","cellindex") := NULL][,hamming := 0]
     share_map <- share_map[,c("falseBC","hamming","trueBC","n"), with = FALSE]
-    
+
     #messy move the shorter true bc list to the main R script
     bc_detected <- bc_detected[is.na(main_bc)]
     bc_detected[,c("partial_bc","main_bc") := NULL]
     bccount <<- bc_detected
-    
+
     if(nrow(binmap)>0){
       binmap <- rbind(binmap, share_map)
     }else{