-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathsummaries.R
412 lines (390 loc) · 14.5 KB
/
summaries.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#' Summarizing microbiome data
#'
#' To query a \code{SummarizedExperiment} for interesting features, several
#' functions are available.
#'
#' @inheritParams getPrevalence
#'
#' @param top \code{Numeric scalar}. Determines how many top taxa to return. Default is
#' to return top five taxa. (Default: \code{5})
#'
#' @param method \code{Character scalar}. Specify the method to determine top taxa. Either
#' sum, mean, median or prevalence. (Default: \code{"mean"})
#'
#' @param ... Additional arguments passed, e.g., to getPrevalence:
#' \itemize{
#' \item \code{sort}: \code{Logical scalar}. Specify
#' whether to sort taxa in alphabetical order or not. Enabled in functions
#' \code{getUnique}, and \code{getTop}.
#' (Default: \code{FALSE})
#' \item \code{na.rm}: \code{Logical scalar}. Specify
#' whether to remove missing values or not. Enabled in functions
#' \code{getUnique}, and \code{getTop}.
#' (Default: \code{FALSE})
#' }
#'
#' @details
#' The \code{getTop} extracts the most \code{top} abundant \dQuote{FeatureID}s
#' in a \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object.
#'
#' The \code{getUnique} is a basic function to access different taxa at a
#' particular taxonomic rank.
#'
#' @return
#' The \code{getTop} returns a vector of the most \code{top} abundant
#' \dQuote{FeatureID}s
#'
#' @seealso
#' \code{\link[=getPrevalence]{getPrevalent}}
#'
#' @name summaries
#'
#' @examples
#' data(GlobalPatterns)
#' top_taxa <- getTop(GlobalPatterns,
#' method = "mean",
#' top = 5,
#' assay.type = "counts")
#' top_taxa
#'
#' # Use 'detection' to select detection threshold when using prevalence method
#' top_taxa <- getTop(GlobalPatterns,
#' method = "prevalence",
#' top = 5,
#' assay_name = "counts",
#' detection = 100)
#' top_taxa
#'
#' # Top taxa os specific rank
#' getTop(agglomerateByRank(GlobalPatterns,
#' rank = "Genus",
#' na.rm = TRUE))
#'
#' # Gets the overview of dominant taxa
#' dominant_taxa <- summarizeDominance(GlobalPatterns,
#' rank = "Genus")
#' dominant_taxa
#'
#' # With group, it is possible to group observations based on specified groups
#' # Gets the overview of dominant taxa
#' dominant_taxa <- summarizeDominance(GlobalPatterns,
#' rank = "Genus",
#' group = "SampleType",
#' na.rm = TRUE)
#'
#' dominant_taxa
#'
#' # Get an overview of sample and taxa counts
#' summary(GlobalPatterns, assay.type= "counts")
#'
#' # Get unique taxa at a particular taxonomic rank
#' # sort = TRUE means that output is sorted in alphabetical order
#' # With na.rm = TRUE, it is possible to remove NAs
#' # sort and na.rm can also be used in function getTop
#' getUnique(GlobalPatterns, "Phylum", sort = TRUE)
#'
NULL
#' @rdname summaries
#' @export
setGeneric("getTop", signature = "x",
function(x, top= 5L, method = c("mean","sum","median"),
assay.type = assay_name, assay_name = "counts",
na.rm = TRUE, ...)
standardGeneric("getTop"))
.check_max_taxa <- function(x, top, assay.type){
if(!is.numeric(top) || as.integer(top) != top){
stop("'top' must be integer value", call. = FALSE)
}
if(top > nrow(assay(x,assay.type))){
stop("'top' must be <= nrow(x)", call. = FALSE)
}
}
#' @rdname summaries
#'
#' @importFrom DelayedMatrixStats rowSums2 rowMeans2 rowMedians
#' @importFrom utils head
#'
#' @export
setMethod("getTop", signature = c(x = "SummarizedExperiment"),
function(x, top = 5L, method = c("mean","sum","median","prevalence"),
assay.type = assay_name, assay_name = "counts",
na.rm = TRUE, ...){
# input check
method <- match.arg(method, c("mean","sum","median","prevalence"))
# check max taxa
.check_max_taxa(x, top, assay.type)
# check assay
.check_assay_present(assay.type, x)
#
if(method == "prevalence"){
taxs <- getPrevalence(assay(x, assay.type), sort = TRUE,
include.lowest = TRUE, ...)
# If there are taxa with prevalence of 0, remove them
taxs <- taxs[ taxs > 0 ]
} else {
taxs <- switch(method,
mean = rowMeans2(assay(x, assay.type), na.rm = na.rm),
sum = rowSums2(assay(x, assay.type), na.rm = na.rm),
median = rowMedians(assay(x, assay.type)), na.rm = na.rm)
names(taxs) <- rownames(assay(x))
taxs <- sort(taxs,decreasing = TRUE)
}
names <- head(names(taxs), n = top)
# Remove NAs and sort if specified
names <- .remove_NAs_and_sort(names, ... )
return(names)
}
)
#' @rdname summaries
#'
#' @param rank \code{Character scalar}. Defines a taxonomic rank. Must be a value of
#' the output of \code{taxonomyRanks()}. (Default: \code{NULl})
#'
#' @return
#' The \code{getUnique} returns a vector of unique taxa present at a
#' particular rank
#'
#' @export
setGeneric("getUnique",
signature = c("x"),
function(x, ...)
standardGeneric("getUnique"))
#' @rdname summaries
#' @export
setMethod("getUnique", signature = c(x = "SummarizedExperiment"),
function(x, rank = NULL, ...){
.check_taxonomic_rank(rank, x)
names <- unique(rowData(x)[,rank])
# Remove NAs and sort if specified
names <- .remove_NAs_and_sort(names, ... )
return(names)
}
)
#' @rdname summaries
#'
#' @param group With group, it is possible to group the observations in an
#' overview. Must be one of the column names of \code{colData}.
#'
#' @param name \code{Character scalar}. A name for the column of the
#' \code{colData} where results will be stored. (Default: \code{"dominant_taxa"})
#'
#' @param ... Additional arguments passed on to \code{agglomerateByRank()} when
#' \code{rank} is specified for \code{summarizeDominance}.
#'
#'
#' @details
#' \code{summarizeDominance} returns information about most dominant
#' taxa in a tibble. Information includes their absolute and relative
#' abundances in whole data set.
#'
#'
#' @return
#' The \code{summarizeDominance} returns an overview in a tibble. It contains dominant taxa
#' in a column named \code{*name*} and its abundance in the data set.
#'
#' @export
setGeneric("summarizeDominance",signature = c("x"),
function(x, group = NULL, name = "dominant_taxa", ...)
standardGeneric("summarizeDominance"))
#' @rdname summaries
#' @export
setMethod("summarizeDominance", signature = c(x = "SummarizedExperiment"),
function(x, group = NULL, name = "dominant_taxa", ...){
# Input check
# group check
if(!is.null(group)){
if(isFALSE(any(group %in% colnames(colData(x))))){
stop("'group' variable must be in colnames(colData(x))",
call. = FALSE)
}
}
# name check
if(!.is_non_empty_string(name)){
stop("'name' must be a non-empty single character value.",
call. = FALSE)
}
# Adds dominant taxa to colData
dominant_taxa <- getDominant(x, ...)
data <- colData(x)
# If the length of dominant taxa is not equal to number of rows, then add rows
# because there are multiple dominant taxa
if(length(unlist(dominant_taxa)) > nrow(data) ){
# Get the order
order <- unique(names(dominant_taxa))
# there are multiple dominant taxa in one sample (counts are equal), length
# of dominant is greater than rows in colData. --> create a list that contain
# dominant taxa, and is as long as there are rows in colData
dominant_taxa_list <- split(dominant_taxa, rep(names(dominant_taxa), lengths(dominant_taxa)) )
# Order the data
dominant_taxa_list <- dominant_taxa_list[order]
data <- data[rep(seq_len(nrow(data)), lengths(dominant_taxa_list)), ]
}
# Add dominant taxa to data
colname <- "dominant_taxa"
data[[colname]] <- dominant_taxa
# Gets an overview
tab <- .tally_col_data(data, group, colname, ...)
colnames(tab)[colnames(tab) == colname] <- name
return(tab)
}
)
################################ HELP FUNCTIONS ################################
#' @importFrom S4Vectors as.data.frame
#' @importFrom dplyr n desc tally group_by arrange mutate
.tally_col_data <- function(data, group, colname, digits = NULL, ...){
# Convert data to data.frame
data <- as.data.frame(data)
# # If there are multiple dominant taxa in one sample, the column is a list.
# # Convert it so that there are multiple rows for sample and each row contains
# # one dominant taxa.
if( is.list(data[[colname]]) ){
# Get dominant taxa as a vector
dominant_taxa <- unlist(data[[colname]])
# Create additional rows
data <- data[rep(seq_len(nrow(data)), lengths(data[[colname]])), ]
# Add dominant taxa
data[[colname]] <- dominant_taxa
}
# Creates a tibble that contains number of times that a column of "name"
# is present in samples and relative portion of samples where they
# present.
# digits check
if(!is.null(digits)){
if(!is.numeric(digits)) {
stop("'digits' must be numeric", call. = FALSE)
}
}
if (is.null(group)) {
colname <- sym(colname)
data <- data %>%
group_by(!!colname)
} else {
group <- sym(group)
colname <- sym(colname)
data <- data %>%
group_by(!!group, !!colname)
}
tallied_data <- data %>%
tally() %>%
mutate(
rel_freq = n / sum(n)
) %>%
arrange(desc(n))
if(!is.null(digits)) {
tallied_data["rel_freq"] <- round(tallied_data["rel_freq"], digits)
}
return(tallied_data)
}
#' @rdname summaries
#'
#' @param object A
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} object.
#'
#' @param assay.type \code{Character scalar}. Specifies the name of the
#' assay used in calculation. (Default: \code{"counts"})
#'
#' @details
#' The \code{summary} will return a summary of counts for all samples and
#' features in
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object.
#'
#' @return
#' The \code{summary} returns a list with two \code{tibble}s
#'
#' @seealso
#' \code{\link[scuttle:perCellQCMetrics]{perCellQCMetrics}},
#' \code{\link[scuttle:perFeatureQCMetrics]{perFeatureQCMetrics}},
#' \code{\link[scuttle:addPerCellQC]{addPerCellQC}},
#' \code{\link[scuttle:addPerFeatureQC]{addPerFeatureQC}},
#' \code{\link[scuttle:quickPerCellQC]{quickPerCellQC}}
#'
#' @export
setMethod("summary", signature = c(object = "SummarizedExperiment"),
function(object, assay.type = assay_name, assay_name = "counts"){
# Input check
.check_assay_present(assay.type = assay.type, object)
#
# check if NA in assay
.check_NAs_assay_counts(object, assay.type)
# check if counts
.check_fraction_or_negative_values(object, assay.type)
sample.summary <- .get_summary_col_data(object, assay.type)
feature.summary <- .get_summary_row_data(object, assay.type)
return(list("samples" = sample.summary, "features" = feature.summary))
}
)
################################ HELP FUNCTIONS summary ####################
#' @importFrom DelayedMatrixStats colSums2
#' @importFrom stats sd median
#' @importFrom tibble tibble
.get_summary_col_data <- function(x, assay.type){
# should check and extract assay
assay.x <- .get_assay(x, assay.type)
summary_col_data <- tibble(total_counts = sum(colSums2(assay.x)),
min_counts = min(colSums2(assay.x)),
max_counts = max(colSums2(assay.x)),
median_counts = median(colSums2(assay.x)),
mean_counts = mean(colSums2(assay.x)),
stdev_counts = sd(colSums2(assay.x)))
return(summary_col_data)
}
#' @importFrom DelayedMatrixStats colSums2
#' @importFrom tibble tibble
.get_summary_row_data <- function(x, assay.type){
# Should check and extract assay
# Internal from agglomerateByRanks
assay.x <- .get_assay(x, assay.type)
summary_row_data <- tibble(total = nrow(assay.x),
singletons = .get_singletons(assay.x),
per_sample_avg = mean(colSums2(assay.x != 0)))
return(summary_row_data)
}
# Get singletons from assay matrix
#' @importFrom DelayedMatrixStats rowSums2
#' @importFrom tibble tibble
.get_singletons<-function(x){
length(rowSums2(x)[rowSums2(x) == 1])
}
# Check if values in assay are fractions or or negative values
#' @importFrom DelayedMatrixStats colSums2
.check_fraction_or_negative_values <- function(x, assay.type){
assay.x <- .get_assay(x, assay.type)
if(any(colSums2(assay.x) < 1) | any(colSums2(assay.x) < 0)){
stop("There are samples that sum to 1 or less counts. ",
"Try to supply raw counts",
call. = FALSE)
}
}
# Remove NAs and order in alphabetical order
.remove_NAs_and_sort <- function(names, sort = FALSE, na.rm = FALSE, ...){
# Check sort
if( !.is_a_bool(sort) ){
stop("'sort' must be a boolean value.", call. = FALSE)
}
# Check na.rm
if( !.is_a_bool(na.rm) ){
stop("'na.rm' must be a boolean value.", call. = FALSE)
}
# Remove NAs if specified
if( na.rm ){
names <- names[ !is.na(names) ]
}
# Sort in alphabetical order if sort is TRUE
if( sort ){
names <- sort(names, na.last = TRUE)
}
return(names)
}
# Check NAs in assay, used when specifically when counts are expected
.check_NAs_assay_counts <- function(x, assay.type){
assay.x <- .get_assay(x, assay.type)
if(any(is.na(assay.x))) {
stop("There are samples with NAs in 'assay': ", assay.type,
" . This function is limited to sequencing data only. ",
"Where raw counts do not usually have NAs. ",
"Try to supply raw counts",
call. = FALSE)
}
}