bartongroup · fruce-ki · Jun 2, 2022 · Feb 8, 2022 · Jun 1, 2022 · Jun 1, 2022
diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,4 @@
 .Rhistory
 .RData
 *.Rproj
-
 .DS_Store
-doc
-Meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: rats
-Version: 0.6.6
-Date: 2019-07-18
+Version: 0.6.7
+Date: 2022-05-30
 Title: Relative Abundance of Transcripts
 Encoding: UTF-8
 Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
@@ -22,13 +22,13 @@ Imports:
     ggplot2  (>= 2.2.0),
     rtracklayer,
     rhdf5,
-    wasabi, 
     GenomicRanges
 Suggests:
     ggbio,
     shiny,
     knitr,
+    rmarkdown,
     testthat
 URL: https://github.com/bartongroup/Rats, http://www.compbio.dundee.ac.uk
 BugReports: https://github.com/bartongroup/RATS/issues
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.2
diff --git a/Meta/vignette.rds b/Meta/vignette.rds
diff --git a/NAMESPACE b/NAMESPACE
@@ -30,7 +30,6 @@ import(matrixStats)
 import(parallel)
 import(rhdf5)
 import(utils)
-import(wasabi)
 importFrom(GenomicRanges,makeGRangesListFromDataFrame)
 importFrom(GenomicRanges,mcols)
 importFrom(parallel,detectCores)

diff --git a/R/input.R b/R/input.R
@@ -127,51 +127,43 @@ annot2models <- function(annotfile, threads= 1L)
 
 
 #================================================================================
-#' Import abundances directly from salmon and kallisto output.
+#' Import abundances directly from kallisto output.
 #'
-#' Convert Salmon read counts format to Kallisto RHDF5 format,
-#' then apply TPM normalisation using the info available from the abundance.h5 files.
+#' Apply TPM normalisation using the info available from the abundance.h5 files.
 #'
 #' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
 #' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications.
 #'
-#' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file.
-#'
 #' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
 #' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.
 #' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
 #' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
-#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. FALSE can also be used to force wasabi to repeat the conversion even if it exists. (Default FALSE)
 #' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstrap} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
 #' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
 #' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
 #' @param threads (integer) For parallel processing. (Default 1)
-#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
+#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic data input format, preferably for bootstrapped estimates (if bootstraps are available) or otherwise for plain count estimates.
 #'
 #' @import parallel
 #' @import data.table
 #' @import rhdf5
-#' @import wasabi
 #'
 #' @export
 
-fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
+fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", beartext=FALSE, threads= 1L, scaleto= 1000000)
 {
   threads <- as.integer(threads)  # Can't be decimal.
   setDTthreads(threads, restore_after_fork = TRUE)
 
   # Don't assume the annotation is ordered properly.
   annot <- tidy_annot(annot, TARGET_COL, PARENT_COL)
 
-  # Wasabi?
-  wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths), force= !half_cooked)
-
   # Sort out scaling factor per sample.
   lgth <- c("A"=length(A_paths), "B"=length(B_paths))
   sfac <- list("A"=NA_real_, "B"=NA_real_)
   if (length(scaleto) == 1) {     # uniform scaling
-    sfac$A <- rep(scaleto, lgth$A)
-    sfac$B <- rep(scaleto, lgth$B)
+    sfac$A <- rep(scaleto, lgth["A"])
+    sfac$B <- rep(scaleto, lgth["B"])
   } else {                        # individual scaling
     sfac$A <- scaleto[1:lgth$A]
     sfac$B <- scaleto[(1 + lgth$A):(lgth$A + lgth$B)]
@@ -201,32 +193,59 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
                                                                                                           # for consistency with the .h5 mode and to allow normalisation to other target sizes 
       # If data from Salmon/Wasabi or Kallisto abundance.h5...
       } else {
+        content <- rhdf5::h5ls(file.path(fil, "abundance.h5"))
         meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")), 
                                     eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
-        counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
+
+        if ('bootstrap' %in% content$name) {
+          counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
+          # Normalise raw counts.
+          tpm <- as.data.table( lapply(counts, function (y) {
+            cpb <- y / meta$eff_length
+            tcpb <- sf / sum(cpb)
+            return(cpb * tcpb)
+          }) )
+          # Structure.
+          dt <- as.data.table( cbind(meta$target_id, tpm) )
+          with(dt, setkey(dt, V1) )
+          names(dt)[1] <- TARGET_COL
+          # Order transcripts to match annotation.
+          dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
+
+        } else {
+          counts <- rhdf5::h5read(file.path(fil, "abundance.h5"), "/est_counts")
+          # Normalize.
+          cpb <- counts / meta$eff_length
+          tcpb <- sf / sum(cpb)
+          tpm <- cpb * tcpb
+          # Structure
+          dt <- as.data.table( cbind(meta$target_id, tpm) )
+          with(dt, setkey(dt, V1) )
+          names(dt)[1] <- TARGET_COL
+          names(dt)[2] <- basename(fil)
+          # Order transcripts to match annotation.
+          dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
+        }
+
+        return (dt)
       }
-
-      # Normalise raw counts.
-      tpm <- as.data.table( lapply(counts, function (y) {
-        cpb <- y / meta$eff_length
-        tcpb <- sf / sum(cpb)
-        return(cpb * tcpb)
-      }) )
-
-      # Structure.
-      dt <- as.data.table( cbind(meta$target_id, tpm) )
-      with(dt, setkey(dt, V1) )
-      names(dt)[1] <- TARGET_COL
-      # Order transcripts to match annotation.
-      dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
-
       return (dt)
     }, mc.cores = min(threads,lgth[cond]), mc.preschedule = TRUE, mc.allow.recursive = FALSE)
 
+    # If single measurement for all samples, ie. est_counts instead of bootsraps, merge and unnest to meet un-bootsrapped format.
+    if( sum(vapply(boots, function(b){length(b)-1}, numeric(1))) == length(boots) ) {
+      boots <- Reduce(merge, boots)
+    }
+
     return (boots)
   })
 
-  names(res) <- c("boot_data_A", "boot_data_B")
+  if (is.data.table(res[[1]])) {
+    names(res) <- c("count_data_A", "count_data_B")
+  } else {
+    names(res) <- c("boot_data_A", "boot_data_B")
+  }
+
   return(res)
 }
 

diff --git a/README.md b/README.md
@@ -70,6 +70,9 @@ BiocManager::install("GenomicRanges")
 * Optional dependencies
 
 ```
+# Gene models
+BiocManager::install("ggbio")
+
 # Interactive volcano plot.
 install.packages("shiny")
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,4 @@ @@
     .Rhistory
     .RData
     *.Rproj
     .DS_Store
-    doc
-    Meta