Fix critical issue with mergeFeatures

Signed-off-by: Daena Rys <[email protected]>
microbiome · May 6, 2024 · 297aa81 · 297aa81
1 parent aaa55f2
commit 297aa81
Show file tree

Hide file tree

Showing 7 changed files with 267 additions and 240 deletions.
diff --git a/R/agglomerate.R b/R/agglomerate.R
@@ -1,6 +1,6 @@
 #' Agglomerate data using taxonomic information
 #'
-#' Agglomeration functions can be used to sum-up data based on specific criteria 
+#' Agglomeration functions can be used to sum-up data based on specific criteria
 #' such as taxonomic ranks, variables or prevalence.
 #'
 #' @param x a
@@ -26,29 +26,29 @@
 #' @param agglomerate.tree \code{TRUE} or \code{FALSE}: should
 #'   \code{rowTree()} also be agglomerated? (Default:
 #'   \code{agglomerate.tree = FALSE})
-#' 
+#'
 #' @param agglomerateTree alias for \code{agglomerate.tree}.
 #'
 #' @param ... arguments passed to \code{agglomerateByRank} function for
 #'   \code{SummarizedExperiment} objects,
-#'   to \code{getPrevalence} and \code{getPrevalentTaxa} and used in 
+#'   to \code{getPrevalence} and \code{getPrevalentTaxa} and used in
 #'   \code{agglomeratebyPrevalence},
 #'   to \code{\link[=merge-methods]{mergeRows}} and
 #'   \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}.
 #'   \itemize{
-#'        \item{\code{remove_empty_ranks}}{A single boolean value for selecting 
+#'        \item{\code{remove_empty_ranks}}{A single boolean value for selecting
 #'        whether to remove those columns of rowData that include only NAs after
 #'        agglomeration. (By default: \code{remove_empty_ranks = FALSE})}
-#'        \item{\code{make_unique}}{A single boolean value for selecting 
+#'        \item{\code{make_unique}}{A single boolean value for selecting
 #'        whether to make rownames unique. (By default: \code{make_unique = TRUE})}
-#'        \item{\code{detection}}{Detection threshold for absence/presence. 
-#'        Either an absolute value compared directly to the values of \code{x} 
+#'        \item{\code{detection}}{Detection threshold for absence/presence.
+#'        Either an absolute value compared directly to the values of \code{x}
 #'        or a relative value between 0 and 1, if \code{as_relative = FALSE}.}
-#'        \item{\code{prevalence}}{Prevalence threshold (in 0 to 1). The 
-#'        required prevalence is strictly greater by default. To include the 
+#'        \item{\code{prevalence}}{Prevalence threshold (in 0 to 1). The
+#'        required prevalence is strictly greater by default. To include the
 #'        limit, set \code{include_lowest} to \code{TRUE}.}
-#'        \item{\code{as.relative}}{Logical scalar: Should the detection 
-#'        threshold be applied on compositional (relative) abundances? 
+#'        \item{\code{as.relative}}{Logical scalar: Should the detection
+#'        threshold be applied on compositional (relative) abundances?
 #'        (default: \code{FALSE})}
 #'    }
 #'
@@ -66,14 +66,14 @@
 #' your results. If no loops exist (loops meaning two higher ranks containing
 #' the same lower rank), the results should be comparable. You can check for
 #' loops using \code{\link[TreeSummarizedExperiment:detectLoop]{detectLoop}}.
-#' 
+#'
 #' Agglomeration sums up the values of assays at the specified taxonomic level. With
 #' certain assays, e.g. those that include binary or negative values, this summing
 #' can produce meaningless values. In those cases, consider performing agglomeration
 #' first, and then applying the transformation afterwards.
 #'
-#' @return 
-#' \code{agglomerateByRank} returns a taxonomically-agglomerated, 
+#' @return
+#' \code{agglomerateByRank} returns a taxonomically-agglomerated,
 #'   optionally-pruned object of the same class as \code{x}.
 #'
 #' @name agglomerate-methods
@@ -92,16 +92,16 @@
 #' ## How many taxa before/after agglomeration?
 #' nrow(GlobalPatterns)
 #' nrow(x1)
-#' 
+#'
 #' # agglomerate the tree as well
 #' x2 <- agglomerateByRank(GlobalPatterns, rank="Family",
 #'                        agglomerate.tree = TRUE)
 #' nrow(x2) # same number of rows, but
 #' rowTree(x1) # ... different
 #' rowTree(x2) # ... tree
-#' 
+#'
 #'  # If assay contains binary or negative values, summing might lead to meaningless
-#'  # values, and you will get a warning. In these cases, you might want to do 
+#'  # values, and you will get a warning. In these cases, you might want to do
 #'  # agglomeration again at chosen taxonomic level.
 #'  tse <- transformAssay(GlobalPatterns, method = "pa")
 #'  tse <- agglomerateByRank(tse, rank = "Genus")
@@ -111,19 +111,19 @@
 #' sum(is.na(rowData(GlobalPatterns)$Family))
 #' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE)
 #' nrow(x3) # different from x2
-#' 
-#' # Because all the rownames are from the same rank, rownames do not include 
-#' # prefixes, in this case "Family:". 
+#'
+#' # Because all the rownames are from the same rank, rownames do not include
+#' # prefixes, in this case "Family:".
 #' print(rownames(x3[1:3,]))
-#' 
+#'
 #' # To add them, use getTaxonomyLabels function.
 #' rownames(x3) <- getTaxonomyLabels(x3, with_rank = TRUE)
 #' print(rownames(x3[1:3,]))
-#' 
+#'
 #' # use 'remove_empty_ranks' to remove columns that include only NAs
 #' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", remove_empty_ranks = TRUE)
 #' head(rowData(x4))
-#' 
+#'
 #' # If the assay contains NAs, you might want to consider replacing them,
 #' # since summing-up NAs lead to NA
 #' x5 <- GlobalPatterns
@@ -135,7 +135,7 @@
 #' assay(x5)[ is.na(assay(x5)) ] <- 0
 #' x6 <- agglomerateByRank(x5, "Kingdom")
 #' head( assay(x6) )
-#' 
+#'
 #' ## Look at enterotype dataset...
 #' data(enterotype)
 #' ## Print the available taxonomic ranks. Shows only 1 available rank,
@@ -174,7 +174,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
                  call. = FALSE)
         }
         if(!.is_non_empty_string(rank)){
-            stop("'rank' must be an non empty single character value.",
+            stop("'rank' must be a non empty single character value.",
                 call. = FALSE)
         }
         if(!.is_a_bool(onRankOnly)){
@@ -189,7 +189,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
         .check_taxonomic_rank(rank, x)
         .check_for_taxonomic_data_order(x)
         #
-        
+
         # Make a vector from the taxonomic data.
         col <- which( taxonomyRanks(x) %in% rank )
         tax_cols <- .get_tax_cols_from_se(x)
@@ -226,6 +226,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
         # adjust rownames
         rownames(x) <- getTaxonomyLabels(x, empty.fields, ...,
                                         with_rank = FALSE, resolve_loops = FALSE)
+        rownames(x) <- sort(rownames(x))
         # Remove those columns from rowData that include only NAs
         x <- .remove_NA_cols_from_rowdata(x, ...)
         x <- .add_values_to_metadata(x, "agglomerated_by_rank", rank)
@@ -339,7 +340,7 @@ setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"),
 .remove_NA_cols_from_rowdata <- function(x, remove_empty_ranks = FALSE, ...){
     # Check remove_empty_ranks
     if( !.is_a_bool(remove_empty_ranks) ){
-        stop("'remove_empty_ranks' must be a boolean value.", 
+        stop("'remove_empty_ranks' must be a boolean value.",
              call. = FALSE)
     }
     # If user wants to remove those columns
@@ -434,4 +435,4 @@ setMethod("mergeFeaturesByRank", signature = c(x = "TreeSummarizedExperiment"),
         tree <- drop.tip(tree, remove_index)
     }
     return(tree)
-}
+}