-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathimportTaxpasta.R
190 lines (182 loc) · 6.62 KB
/
importTaxpasta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#' Import taxpasta-specific BIOM results to
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#'
#' @name importTaxpasta
#'
#' @details
#' \code{importTaxpasta} imports data that is returned from Taxonomic Profile
#' Aggregation and Standardization (taxpasta) pipeline. See more information on
#' taxpasta from
#' \href{https://taxpasta.readthedocs.io/en/latest/}{taxpasta documentation}.
#'
#' @param file \code{Character scalar}. Defines the file path to a
#' BIOM file.
#' @param add.tree \code{Logical scalar}. Specifies whether to calculate
#' and add hierarchy tree using \code{\link{addHierarchyTree}}.
#' (Default: \code{TRUE})
#'
#' @param ... additional arguments
#' \itemize{
#' \item \code{set.ranks}: \code{Logical scalar}. Should column names of
#' taxonomy table be treated as taxonomy ranks? (Default: \code{FALSE})
#' }
#'
#' @return A
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#' object.
#'
#' @examples
#'
#' \dontrun{
#' # File path to BIOM file
#' file_path <- system.file("extdata", "complete.biom", package = "mia")
#' # Import BIOM as TreeSE, and set ranks.
#' tse <- importTaxpasta(file_path, set.ranks = TRUE)
#' # Import BIOM as TreeSE without adding hierarchy tree
#' tse <- importTaxpasta(file_path, add.tree = FALSE)
#' }
#'
#' @seealso
#' \code{\link[=importBIOM]{importBIOM}}
#' \code{\link[=convertFromBIOM]{convertFromBIOM}}
#'
NULL
#'
#' @rdname importTaxpasta
#' @export
#'
#' @importFrom SummarizedExperiment rowData
importTaxpasta <- function(file, add.tree = TRUE, ...) {
# Check dependencies.
.require_package("rhdf5")
.require_package("biomformat")
# Validate the input.
if(!.is_non_empty_string(file) ){
stop("'filename' must be a single non-empty character value.",
call. = FALSE)
}
if( !file.exists(file) ){
stop("'", file, "' not found.", call. = FALSE)
}
if (!.is_a_bool(add.tree)) {
stop("'add.tree' must be TRUE or FALSE.", call. = FALSE)
}
# We read our own HDF5 array to later be able to read observation group
# metadata, which [biomformat::read_biom()] currently doesn't do.
raw <- rhdf5::h5read(file, "/", read.attributes = TRUE)
biom <- .create_biom(raw)
# Convert BIOM to TreeSE
tse <- convertFromBIOM(biom)
# If we have taxonomic information, we add a hierarchy to the TreeSE.
if( !is.null(raw$observation$`group-metadata`$ranks) ){
# Get ranks from data
ranks <- .get_ranks(raw)
# Create rowData and rowTree
rowData(tse) <- .create_row_data(biom, ranks)
.set_ranks_based_on_rowdata(tse, ...)
if (add.tree) tse <- addHierarchyTree(tse)
# Agglomerate to all existing ranks
tse <- agglomerateByRanks(tse, update.tree = TRUE)
} else{
# Without taxonomic information, we return a simple TreeSE.
warning(
"The BIOM file does not contain taxonomy information;",
"unable to generate a taxonomic tree.", call. = FALSE)
}
return(tse)
}
#' Create BIOM object from raw HDF5 array
#'
#' \code{.create_biom} is an internal function used by
#' \code{\link[=importTaxpasta]{importTaxpasta}}, that more or less replicates
#' \code{\link[biomformat:read_hdf5_biom]{read_hdf5_biom}}.
#'
#' @param h5array A raw HDF5 array read from a BIOM file.
#'
#' @return A \code{\link[biomformat:biom]{biom}} object.
#'
#' @keywords internal
#' @noRd
#' @importFrom utils getFromNamespace
.create_biom <- function(h5array) {
# Get relevant functions from namespace. Using pkg:::function gives warnings
# in R CMD check. By doing like this, we can avoid them.
generate_matrix <- getFromNamespace("generate_matrix", "biomformat")
generate_metadata <- getFromNamespace("generate_metadata", "biomformat")
namedList <- getFromNamespace("namedList", "biomformat")
# Get abundance data, row data and column data along with shape
data <- generate_matrix(h5array)
rows <- generate_metadata(h5array$observation)
columns <- generate_metadata(h5array$sample)
shape <- c(length(data), length(data[[1]]))
# Get metadata on the data
id <- attr(h5array, "id")
vs <- attr(h5array, "format-version")
format <- sprintf("Biological Observation Matrix %s.%s", vs[1], vs[2])
format_url <- attr(h5array, "format-url")
type <- attr(h5array, "type")
# this bc biomformat does not allow empty 'type' field
type <- ifelse(type == "", "OTU table", type)
generated_by <- attr(h5array, "generated-by")
date <- attr(h5array, "creation-date")
matrix_type <- "dense"
matrix_element_type <- "int"
# Create BIOM object
result <- namedList(
id, format, format_url, type, generated_by, date, matrix_type,
matrix_element_type, rows, columns, shape, data
)
result <- biomformat::biom(result)
return(result)
}
#' Get taxonomic ranks from observation group metadata
#'
#' \code{.get_ranks} is an internal function used by
#' \code{\link[=importTaxpasta]{importTaxpasta}}, that reads the string of
#' semi-colon separated ranks from the group observation metadata.
#'
#' @param h5array A raw HDF5 array read from a BIOM file.
#'
#' @return A character vector of taxonomic ranks in order.
#'
#' @keywords internal
#' @noRd
.get_ranks <- function(h5array) {
ranks <- strsplit(h5array$observation$`group-metadata`$ranks,
";",
fixed = TRUE
)[[1]]
return(ranks)
}
#' Recreate observation metadata
#'
#' \code{.create_row_data} is an internal function used by
#' \code{\link[=importTaxpasta]{importTaxpasta}}, that returns a copy of the
#' BIOM object's observation metadata, where the headers of the taxonomy
#' columns have been replaced with the correct taxonomic rank names.
#'
#' @param biom A BIOM object.
#' @param ranks A character vector of taxonomic ranks in order.
#'
#' @return A \code{\link[base:data.frame]{data.frame}} with observation
#' metadata.
#'
#' @keywords internal
#' @noRd
.create_row_data <- function(biom, ranks) {
# Get taxonomy table
meta <- biomformat::observation_metadata(biom)
# Get column names of taxonomy table
column.names <- colnames(meta)
# Taxonomy ranks should start with "taxonomy", take all of them
indeces <- startsWith(column.names, "taxonomy")
# Check that they match with ranks got from the data
if( sum(indeces) != length(ranks) ){
stop(
"The number of generic taxonomy* columns differs",
"from the number of ranks.", call. = FALSE)
}
# And replace column names with the set of ranks
colnames(meta) <- replace(column.names, indeces, ranks)
return(meta)
}