Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0.6.7 #70

Merged
merged 4 commits into from
Jun 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,4 @@
.Rhistory
.RData
*.Rproj

.DS_Store
doc
Meta
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: rats
Version: 0.6.6
Date: 2019-07-18
Version: 0.6.7
Date: 2022-05-30
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
Expand All @@ -22,13 +22,13 @@ Imports:
ggplot2 (>= 2.2.0),
rtracklayer,
rhdf5,
wasabi,
GenomicRanges
Suggests:
ggbio,
shiny,
knitr,
rmarkdown,
testthat
URL: https://github.com/bartongroup/Rats, http://www.compbio.dundee.ac.uk
BugReports: https://github.com/bartongroup/RATS/issues
RoxygenNote: 6.1.1
RoxygenNote: 7.1.2
Binary file modified Meta/vignette.rds
Binary file not shown.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import(matrixStats)
import(parallel)
import(rhdf5)
import(utils)
import(wasabi)
importFrom(GenomicRanges,makeGRangesListFromDataFrame)
importFrom(GenomicRanges,mcols)
importFrom(parallel,detectCores)
Expand Down
81 changes: 50 additions & 31 deletions R/input.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,51 +127,43 @@ annot2models <- function(annotfile, threads= 1L)


#================================================================================
#' Import abundances directly from salmon and kallisto output.
#' Import abundances directly from kallisto output.
#'
#' Convert Salmon read counts format to Kallisto RHDF5 format,
#' then apply TPM normalisation using the info available from the abundance.h5 files.
#' Apply TPM normalisation using the info available from the abundance.h5 files.
#'
#' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
#' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications.
#'
#' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file.
#'
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.
#' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. FALSE can also be used to force wasabi to repeat the conversion even if it exists. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstrap} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
#' @param threads (integer) For parallel processing. (Default 1)
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic data input format, preferably for bootstrapped estimates (if bootstraps are available) or otherwise for plain count estimates.
#'
#' @import parallel
#' @import data.table
#' @import rhdf5
#' @import wasabi
#'
#' @export

fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", beartext=FALSE, threads= 1L, scaleto= 1000000)
{
threads <- as.integer(threads) # Can't be decimal.
setDTthreads(threads, restore_after_fork = TRUE)

# Don't assume the annotation is ordered properly.
annot <- tidy_annot(annot, TARGET_COL, PARENT_COL)

# Wasabi?
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths), force= !half_cooked)

# Sort out scaling factor per sample.
lgth <- c("A"=length(A_paths), "B"=length(B_paths))
sfac <- list("A"=NA_real_, "B"=NA_real_)
if (length(scaleto) == 1) { # uniform scaling
sfac$A <- rep(scaleto, lgth$A)
sfac$B <- rep(scaleto, lgth$B)
sfac$A <- rep(scaleto, lgth["A"])
sfac$B <- rep(scaleto, lgth["B"])
} else { # individual scaling
sfac$A <- scaleto[1:lgth$A]
sfac$B <- scaleto[(1 + lgth$A):(lgth$A + lgth$B)]
Expand Down Expand Up @@ -201,32 +193,59 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
# for consistency with the .h5 mode and to allow normalisation to other target sizes
# If data from Salmon/Wasabi or Kallisto abundance.h5...
} else {
content <- rhdf5::h5ls(file.path(fil, "abundance.h5"))
meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")),
eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )

if ('bootstrap' %in% content$name) {
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
# Normalise raw counts.
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

} else {
counts <- rhdf5::h5read(file.path(fil, "abundance.h5"), "/est_counts")
# Normalize.
cpb <- counts / meta$eff_length
tcpb <- sf / sum(cpb)
tpm <- cpb * tcpb
# Structure
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
names(dt)[2] <- basename(fil)
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
}

return (dt)
}

# Normalise raw counts.
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )

# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

return (dt)
}, mc.cores = min(threads,lgth[cond]), mc.preschedule = TRUE, mc.allow.recursive = FALSE)

# If single measurement for all samples, ie. est_counts instead of bootsraps, merge and unnest to meet un-bootsrapped format.
if( sum(vapply(boots, function(b){length(b)-1}, numeric(1))) == length(boots) ) {
boots <- Reduce(merge, boots)
}

return (boots)
})

names(res) <- c("boot_data_A", "boot_data_B")
if (is.data.table(res[[1]])) {
names(res) <- c("count_data_A", "count_data_B")
} else {
names(res) <- c("boot_data_A", "boot_data_B")
}

return(res)
}

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ BiocManager::install("GenomicRanges")
* Optional dependencies

```
# Gene models
BiocManager::install("ggbio")

# Interactive volcano plot.
install.packages("shiny")
```
Expand Down
Loading