-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathgetDominant.R
231 lines (222 loc) · 8.75 KB
/
getDominant.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#' Get dominant taxa
#'
#' These functions return information about the most dominant taxa in a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object.
#'
#' @inheritParams agglomerateByVariable
#' @inheritParams getPrevalence
#'
#' @param name \code{Character scalar}. A name for the column of the
#' \code{colData} where results will be stored.
#' (Default: \code{"dominant_taxa"})
#'
#' @param other.name \code{Character scalar}. A name for features that are not
#' included in n the most frequent dominant features in the data.
#' (Default: \code{"Other"})
#'
#' @param group \code{Character scalar}. Defines a group. Must be one of the
#' columns from \code{rowData(x)}. (Default: \code{NULL})
#'
#' @param rank Deprecated. Use \code{group} instead.
#'
#' @param n \code{Numeric scalar}. The number of features that are the most
#' frequent
#' dominant features. Default is NULL, which defaults that each sample is
#' assigned
#' a dominant taxon. (Default: \code{NULL})
#'
#' @param complete \code{Logical scalar}. A value to manage multiple dominant
#' taxa for a sample.
#' Default for getDominant is TRUE to include all equally dominant taxa
#' for each sample. complete = FALSE samples one taxa for the samples that have
#' multiple.
#' Default for addDominant is FALSE to add a column with only one
#' dominant taxon assigned for each sample into colData. complete = TRUE adds a
#' list that includes all dominant taxa for each sample into colData.
#'
#' @param ... Additional arguments passed on to \code{agglomerateByRank()} when
#' \code{rank} is specified.
#'
#' @details
#' \code{addDominant} extracts the most abundant taxa in a
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' object, and stores the information in the \code{colData}. It is a wrapper for
#' \code{getDominant}.
#'
#' With \code{group} parameter, it is possible to agglomerate rows based on
#' groups. If the value is one of the columns in \code{taxonomyRanks()},
#' \code{agglomerateByRank()} is applied. Otherwise,
#' \code{agglomerateByVariable()} is utilized.
#' E.g. if 'Genus' rank is used, all abundances of same Genus
#' are added together, and agglomerated features are returned.
#' See corresponding functions for additional arguments to deal with
#' missing values or special characters.
#'
#' @return \code{getDominant} returns a named character vector \code{x}
#' while \code{addDominant} returns
#' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#' with additional column in \code{\link{colData}} named \code{*name*}.
#'
#' @name getDominant
#' @export
#'
#' @examples
#' data(GlobalPatterns)
#' x <- GlobalPatterns
#'
#' # Finds the dominant taxa.
#' sim.dom <- getDominant(x, group = "Genus")
#'
#' # Add information to colData
#' x <- addDominant(x, group = "Genus", name ="dominant_genera")
#' colData(x)
NULL
#' @rdname getDominant
#' @export
setGeneric("getDominant",signature = c("x"),
function(x, assay.type = assay_name, assay_name = "counts",
group = rank, rank = NULL, other.name = "Other", n = NULL,
complete = TRUE, ...)
standardGeneric("getDominant"))
#' @rdname getDominant
#' @importFrom IRanges relist
#' @export
setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = assay_name, assay_name = "counts", group = rank,
rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
# Input check
# Check assay.type
.check_assay_present(assay.type, x)
# group check
if(!is.null(group)){
if(!.is_a_string(group)){
stop("'group' must be an single character value.",
call. = FALSE)
}
}
# If "group" is not NULL, species are aggregated according to the
# taxonomic rank that is specified by user.
if (!is.null(group) && group %in% taxonomyRanks(x)) {
x <- agglomerateByRank(x, rank = group, ...)
# or factor that is specified by user
} else if (!is.null(group)) {
x <- agglomerateByVariable(x, by = "rows", group = group, ...)
}
# Get assay
mat <- assay(x, assay.type)
# apply() function finds the indices of taxa's that has the highest
# abundance.
# rownames() returns the names of taxa that are the most abundant.
idx <- as.list(apply(t(mat) == colMaxs(mat),1L,which))
# Get rownames based on indices
taxa <- rownames(mat)[unlist(idx)]
# If multiple dominant taxa were found, names contain taxa in addition
# to
# sample name. Names are converted so that they include only sample
# names.
names(taxa) <- rep( names(idx), times = lengths(idx) )
# If individual sample contains multiple dominant taxa (they have equal
# counts) and if complete is FALSE, the an arbitrarily chosen dominant
# taxa is returned
if( length(taxa)>ncol(x) && !complete){
# Store order
order <- unique(names(taxa))
# there are multiple dominant taxa in one sample (counts are equal),
# length
# of dominant is greater than rows in colData.
taxa <- split(taxa, rep(names(taxa), lengths(taxa)) )
# Order the data
taxa <- taxa[order]
names <- names(taxa)
# If complete is set FALSE, and there are multiple dominant taxa,
# one of them is arbitrarily chosen
taxa <- lapply(taxa, function(item) {
return(sample(item, 1)) })
taxa <- unname(unlist(lapply(taxa, function (x) {
unlist(x)})))
names(taxa) <- names
warning(
"Multiple dominant taxa were found for some samples. ",
"Use complete = TRUE for details.", call. = FALSE)
}
# Name "Other" the features that are not included in n the most abundant
# in the data
if(!is.null(n)){
flat_taxa <- unlist(taxa, recursive = TRUE)
top <- .top_n(flat_taxa, n=n)
top <- names(top)
# Group the rest into the "Other" category
taxa <- lapply(taxa, function(x){
ind <- vapply(x, function(y) y %in% top, FUN.VALUE = logical(1))
if( any(ind) ){
res <- x[ ind ]
} else{
res <- other.name
}
return(res)
})
if ( all(lengths(taxa) == 1 ) ){
taxa <- unlist(taxa)
}
}
return(taxa)
}
)
#' @rdname getDominant
#' @export
setGeneric("addDominant", signature = c("x"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
standardGeneric("addDominant"))
#' @rdname getDominant
#' @export
setMethod("addDominant", signature = c(x = "SummarizedExperiment"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL,
complete = FALSE, ...) {
# name check
if(!.is_non_empty_string(name)){
stop("'name' must be a non-empty single character value.",
call. = FALSE)
}
# other.name check
if(!.is_non_empty_string(other.name)){
stop("'other.name' must be a non-empty single character value.",
call. = FALSE)
}
dom.taxa <- getDominant(
x, other.name = other.name, n = n, complete = complete, ...)
# Add list into colData if there are multiple dominant taxa
if(length(unique(names(dom.taxa))) < length(names(dom.taxa))) {
# Store order
order <- unique(names(dom.taxa))
grouped <- split(dom.taxa, rep(names(dom.taxa)), lengths(dom.taxa))
grouped <- grouped[order]
dom.taxa <- grouped
warning("A new column that was added in colData(x) is a list",
call. = FALSE)
}
colData(x)[[name]] <- dom.taxa
return(x)
}
)
########################## HELP FUNCTIONS summary ##############################
# top entries in a vector or given field in a data frame
.top_n <- function(x, n = NULL, na.rm = FALSE) {
if (na.rm){
inds <- which(x == "NA")
if (length(inds) > 0){
x[inds] <- NA
warning("Interpreting NA string as missing value NA. Removing",
length(inds), "entries", call. = FALSE)
}
x <- x[!is.na(x)]
}
# Create a frequency table of unique values of the dominant taxa for each
# sample
s <- rev(sort(table(x)))
# Include only n the most frequent taxa
if (!is.null(n)){
s <- s[seq_len(min(n, length(s)))]
}
return(s)
}