From ec0e4024b621a36a8d6ec1be9c4f4a7f7021e3f9 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Thu, 27 Feb 2025 20:12:44 -0500 Subject: [PATCH 1/2] WIP - all done except limma --- NAMESPACE | 4 + R/normalize.R | 843 ++++++++++++++++++++++++++++++++++++++++++- R/package_imports.R | 1 + R/zzz.R | 33 ++ man/norm_default.Rd | 45 +++ man/norm_l2.Rd | 46 +++ man/norm_library.Rd | 45 +++ man/norm_log.Rd | 44 +++ man/norm_osmfish.Rd | 54 +++ man/norm_pearson.Rd | 68 ++++ man/norm_quantile.Rd | 59 +++ man/norm_tfidf.Rd | 52 +++ man/processData.Rd | 55 +++ man/process_param.Rd | 59 +++ man/scale_default.Rd | 30 ++ man/scale_zscore.Rd | 40 ++ 16 files changed, 1474 insertions(+), 4 deletions(-) create mode 100644 man/norm_default.Rd create mode 100644 man/norm_l2.Rd create mode 100644 man/norm_library.Rd create mode 100644 man/norm_log.Rd create mode 100644 man/norm_osmfish.Rd create mode 100644 man/norm_pearson.Rd create mode 100644 man/norm_quantile.Rd create mode 100644 man/norm_tfidf.Rd create mode 100644 man/processData.Rd create mode 100644 man/process_param.Rd create mode 100644 man/scale_default.Rd create mode 100644 man/scale_zscore.Rd diff --git a/NAMESPACE b/NAMESPACE index cf94c401c..508e29ff6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ export(addSpatialCentroidLocationsLayer) export(addStatistics) export(addVisiumPolygons) export(adjustGiottoMatrix) +export(adjustParam) export(aggregateStacks) export(aggregateStacksExpression) export(aggregateStacksLocations) @@ -286,6 +287,7 @@ export(makeSignMatrixDWLSfromMatrix) export(makeSignMatrixPAGE) export(makeSignMatrixRank) export(mergeClusters) +export(normParam) export(normalizeGiotto) export(objHistory) export(objName) @@ -394,6 +396,7 @@ export(sankeyRelate) export(sankeySet) export(sankeySetAddresses) export(saveGiotto) +export(scaleParam) export(screePlot) export(selectPatternGenes) export(setCellMetadata) @@ -523,6 +526,7 @@ import(ggplot2) import(methods) import(stats, except = density) import(utils) +importClassesFrom(Matrix,Matrix) importClassesFrom(data.table,data.table) importFrom(GiottoClass,"activeFeatType<-") importFrom(GiottoClass,"activeSpatUnit<-") diff --git a/R/normalize.R b/R/normalize.R index 47ca2eca5..c6085b504 100644 --- a/R/normalize.R +++ b/R/normalize.R @@ -1,3 +1,810 @@ +# Documentation #### +#' @name processData +#' @title Composable Data Processing +#' @description +#' Perform data transformations, or set up chains of transformations and +#' operations to be applied to matrix type data. `processData()` is a generic +#' for which methods can be defined off both `x` (the data to transform), +#' and `param` (the transform operation). +#' @param x data to transform +#' @param param S4 parameter class defining the transform operation and +#' params affecting it. +#' @param name character. [Object name][GiottoClass::giotto_schema] to assign +#' to the output. +#' @param \dots additional params to pass +#' @examples +#' m <- matrix(c(0, 0, 3, 2, 0, 5, 4, 0, 0, 1, 12, 0), nrow = 3) +#' +#' # single operation +#' lib_norm <- normParam("library") +#' lib_norm$scalefactor <- 5000 # alter a default param of library norm +#' processData(m, lib_norm) +#' +#' # chained operations +#' log_norm <- normParam("log") +#' zscore_cols <- scaleParam("zscore") +#' zscore_rows <- scaleParam("zscore", MARGIN = 1) +#' # this is essentially the same as the default giotto normalization +#' # only difference is the library norm scalefactor change. 
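+#' # steps in the list are applied in order (left to right), so the raw
+#' # matrix is library normalized, then log normalized, then z-scored
+#' # along columns and finally along rows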
+#' processData(m, list(lib_norm, log_norm, zscore_cols, zscore_rows))
+#' @seealso [process_param] for processing operations that can be performed
+#' through `processData()`
+#' @md
+NULL
+
+#' @name process_param
+#' @title Data Processing Parameter Class Factories
+#' @description Data processing operations in Giotto Suite can be divided into
+#' normalization, scaling, and adjustments
+#' @param method character. Name of method to use. See details.
+#' @param \dots (optional) Additional named parameters relevant to the param
+#' class.
+#' @section normParam methods:
+#'
+#' * [`"default"`][norm_default] - default Giotto normalization steps
+#' (library + log norms)
+#' * [`"library"`][norm_library] - library normalization
+#' * [`"log"`][norm_log] - log normalization
+#' * [`"osmfish"`][norm_osmfish] - osmfish normalization method
+#' * [`"pearson"`][norm_pearson] - Lause/Kobak 2020 pearson residuals
+#' normalization
+#' * [`"quantile"`][norm_quantile] - quantile normalization
+#' * [`"tf-idf"`][norm_tfidf] - Term Frequency-Inverse Document Frequency
+#' * [`"l2"`][norm_l2] - L2 normalization (also known as Euclidean
+#' normalization)
+#'
+#' @section scaleParam methods:
+#'
+#' * [`"default"`][scale_default] - default Giotto scaling steps (scale along
+#' features then cells)
+#' * [`"zscore"`][scale_zscore] - essentially the same as `base::scale()`, but
+#' with a `MARGIN` param allowing scaling along either cols or rows
+#'
+#' @section adjustParam methods:
+#'
+#' * `"limma"` - limma batch correction
+#' @md
+NULL
+
+#' @name norm_default
+#' @title Default Giotto Normalization
+#' @description
+#' Expression matrix normalization method.
+#'
+#' Steps:
+#'
+#' 1. [Total library size][norm_library] normalization and scaling by
+#' a custom scale-factor.
+#' 2. [Log][norm_log] transformation of data.
+#'
+#' @section params:
+#'
+#' \tabular{ll}{
+#' `library_size_norm` \tab logical (default = `TRUE`). Whether to perform
+#' library size normalization \cr
+#' `scalefactor` \tab numeric (default = 6000). Scalefactor to use after
+#' library size normalization. (skipped if `library_size_norm = FALSE`) \cr
+#' `log_norm` \tab logical (default = `TRUE`). Whether to transform values to
+#' log-scale. \cr
+#' `log_offset` \tab numeric (default = 1). If `log_norm = TRUE`, offset
+#' value to add to expression values to avoid `log(0)` \cr
+#' `logbase` \tab numeric (default = 2). If `log_norm = TRUE`, log base to
+#' use to log normalize expression values
+#' }
+#' @family normalization parameters
+#' @seealso [process_param]
+#' @md
+NULL
+
+#' @name norm_library
+#' @title Library Size Normalization
+#' @description
+#' Normalize expression matrix for total library size and then scale by
+#' a custom scalefactor.
+#'
+#' This method does not work well when any cells/samples
+#' have a library size of 0, so filtering prior to this is recommended.
+#'
+#' \deqn{\LARGE
+#' x'_{i,j} = \frac{x_{i,j}}{\sum_{i} x_{i,j}} \times k
+#' }
+#' Where:
+#'
+#' * (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{x'_{i,j}}) is the library normalized and scaled expression value for
+#' feature \eqn{i} in sample \eqn{j}
+#' * (k) is a scalefactor applied after normalization
+#'
+#' @section params:
+#'
+#' \tabular{ll}{
+#' `scalefactor` \tab numeric (default = 6000). Scalefactor to use after
+#' library size normalization. Expressed as ***k*** in the above equation
+#' }
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_log
+#' @title Log Normalization
+#' @description
+#' Apply a log normalization
+#'
+#' \deqn{\LARGE
+#' x'_{i,j} = \frac{\log(x_{i,j} + b)}{\log(a)}
+#' }
+#' Where:
+#'
+#' * (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{x'_{i,j}}) is the log normalized expression value for feature
+#' \eqn{i} in sample \eqn{j}
+#' * (\eqn{a}) is the log base
+#' * (\eqn{b}) is an offset value
+#'
+#' @section params:
+#'
+#' \tabular{ll}{
+#' `base` \tab numeric (default = 2) log base to use. Expressed as \eqn{a} in
+#' the above equation. \cr
+#' `offset` \tab numeric (default = 1). Offset to add to expression values to
+#' avoid \eqn{\log(0)}. Expressed as \eqn{b} in the above equation.
+#' }
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_osmfish
+#' @title osmFISH Normalization
+#' @description
+#' Normalization method as provided by the osmFISH paper
+#'
+#' Steps:
+#'
+#' 1. First normalize genes, for each gene divide the counts by the total gene
+#' count and multiply by the total number of genes.
+#' 2. Next normalize cells, for each cell divide the normalized gene counts by
+#' the total counts per cell and multiply by the total number of cells.
+#'
+#' \deqn{\LARGE
+#' x'_{i,j} = \frac{x_{i,j}}{\sum_j x_{i,j}} \times n_{\text{features}}
+#' }
+#'
+#' \deqn{\LARGE
+#' x''_{i,j} = \frac{x'_{i,j}}{\sum_i x'_{i,j}} \times n_{\text{samples}}
+#' }
+#'
+#' Where:
+#'
+#' * (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{x'_{i,j}}) is the feature normalized expression value
+#' * (\eqn{x''_{i,j}}) is the final normalized expression value after both
+#' feature and cell normalization
+#' * (\eqn{n_{\text{samples}}}) is the total number of cells
+#' (columns in matrix)
+#' * (\eqn{n_{\text{features}}}) is the total number of features
+#' (rows in matrix)
+#'
+#' @section params:
+#' None
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_pearson
+#' @title Lause/Kobak Pearson Residuals Normalization
+#' @description
+#' Calculate Pearson residuals with a dispersion adjustment, to identify cells
+#' that deviate significantly from what would be expected under independence.
+#' The normalization divides the difference between observed and expected
+#' counts by its standard deviation, which is adjusted by the dispersion
+#' parameter θ.
+#'
+#' This normalization is designed for detection of highly variable features,
+#' dimension reduction, and clustering.
+#'
+#' \deqn{\LARGE
+#' z_{i,j} = \frac{x_{i,j} - \mu_{i,j}}{\sqrt{\mu_{i,j} + \mu_{i,j}^2 / \theta}}
+#' }
+#'
+#' \deqn{\LARGE
+#' \mu_{i,j} = \frac{r_i \cdot c_j}{N}
+#' }
+#'
+#' Where:
+#' * (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{\mu_{i,j}}) is the expected value under the model
+#' * (\eqn{r_i}) is \eqn{\sum_j x_{i,j}}
+#' * (\eqn{c_j}) is \eqn{\sum_i x_{i,j}}
+#' * (\eqn{N}) is \eqn{\sum_{i,j} x_{i,j}}
+#' * (\eqn{\theta}) is a dispersion parameter
+#' * (\eqn{z_{i,j}}) is the Pearson residual clipped to the range
+#' \eqn{[-\sqrt{n}, \sqrt{n}]} where \eqn{n} is the number of columns. This is
+#' done to prevent extreme values from dominating the analysis.
+#'
+#' # Note
+#' Scaling is not recommended after this normalization since it is already
+#' transforming the data to z-score-like values with a dispersion adjustment.
+#' It is also not recommended to use this with DGE analysis.
+#'
+#' @section params:
+#'
+#' \tabular{ll}{
+#' `theta` \tab numeric (default = 100). Dispersion parameter, expressed as
+#' \eqn{\theta} in the above formula
+#' }
+#'
+#' @references Lause, J., Berens, P. & Kobak, D. Analytic Pearson residuals for
+#' normalization of single-cell RNA-seq UMI data. Genome Biol 22, 258 (2021).
+#' https://doi.org/10.1186/s13059-021-02451-7
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_quantile
+#' @title Quantile Normalization
+#' @description
+#' Quantile normalization makes the statistical distribution of values in each
+#' column identical by replacing the original values with the mean of the
+#' values at the same rank across all columns. This removes technical variation
+#' while preserving relative differences between features.
+#'
+#' Steps:
+#' 1. Rank the values within each column (average taken in case of ties)
+#' 2. Calculate the mean of values at the same rank across all columns
+#' 3. Replace each value with the mean value corresponding to its rank
+#'
+#' \deqn{\LARGE
+#' q_{i,j} = \bar{x}_{rank(i,j)}
+#' }
+#'
+#' Where:
+#' * (\eqn{rank(i,j)}) is the rank of feature \eqn{i} within column \eqn{j}
+#' * (\eqn{\bar{x}_{r}}) where \eqn{r = rank(i,j)} is the mean of values with
+#' rank \eqn{r} across all columns
+#' * (\eqn{q_{i,j}}) is the quantile-normalized value
+#'
+#' # Note
+#' Library normalization and log normalization are recommended prior to this
+#' normalization.
+#'
+#' @section params:
+#' None
+#'
+#' @references Bolstad, B.M., Irizarry, R.A., Astrand, M. et al. A comparison of
+#' normalization methods for high density oligonucleotide array data based on
+#' variance and bias. Bioinformatics 19, 185–193 (2003).
+#' https://doi.org/10.1093/bioinformatics/19.2.185
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_tfidf
+#' @title TF-IDF Normalization
+#' @description
+#' TF-IDF (Term Frequency-Inverse Document Frequency) normalization is borrowed
+#' from natural language processing to identify features that are highly expressed
+#' in specific samples but not widely expressed across the entire dataset.
+#'
+#' \deqn{\LARGE
+#' TF_{i,j} = \frac{x_{i,j}}{\sum_{i} x_{i,j}}
+#' }
+#'
+#' \deqn{\LARGE
+#' IDF_{i} = \log(1 + \frac{n_{samples}}{1 + n_{samples \: where \: feature \: i > 0}})
+#' }
+#'
+#' \deqn{\LARGE
+#' TFIDF_{i,j} = TF_{i,j} \times IDF_{i}
+#' }
+#'
+#' Where:
+#' * (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{TF_{i,j}}) is the term frequency of feature \eqn{i} in sample \eqn{j}
+#' * (\eqn{IDF_{i}}) is the inverse document frequency of feature \eqn{i}
+#' * (\eqn{TFIDF_{i,j}}) is the final TF-IDF normalized value
+#'
+#' # Note
+#' [L2][norm_l2] normalization is commonly performed after TF-IDF normalization
+#'
+#' @section params:
+#' None
+#' @md
+#' @family normalization parameters
+#' @seealso [process_param]
+NULL
+
+#' @name norm_l2
+#' @title L2 Normalization
+#' @description
+#' L2 normalization (also known as Euclidean normalization) scales each column
+#' (sample) in the expression matrix to have unit Euclidean length. This
+#' process makes samples with different sequencing depths more comparable and
+#' improves the performance of distance-based analyses.
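+#' For example, a column containing `c(3, 4)` has Euclidean length 5 and
+#' is rescaled to `c(0.6, 0.8)`.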
+#' +#' \deqn{\LARGE +#' x'_{i,j} = \frac{x_{i,j}}{\sqrt{\sum_{i} x_{i,j}^2}} +#' } +#' +#' Where: +#' * (\eqn{x_{i,j}}) is the expression value for feature \eqn{i} in sample \eqn{j} +#' * (\eqn{x'_{i,j}}) is the L2-normalized expression value +#' +#' @section Note: +#' L2 normalization can be applied to raw data, but is most commonly used after +#' other normalization methods such as TF-IDF or log normalization to standardize +#' sample-to-sample comparisons. +#' +#' @section params: +#' None +#' +#' @family normalization parameters +#' @seealso [process_param] +#' @md +NULL + +#' @name scale_default +#' @title Default Giotto Scaling +#' @description +#' 2 step [z-scoring][scale_zscore] along features and samples +#' @section params: +#' +#' \tabular{ll}{ +#' `scale_feats` \tab logical (default = `TRUE`) Whether to scale across +#' features \cr +#' `scale_cells` \tab logical (default = `TRUE`) Whether to scale across +#' cells/samples \cr +#' `scale_order` \tab character. One of either `"first_feats"` or +#' `"first_cells"`. When both `scale_feats` and `scale_cells` are `TRUE`, +#' determines the order in which the 2 scaling operations are performed. \cr +#' `verbose` \tab logical (default = `TRUE`) Whether to be verbose +#' } +#' +#' @md +#' @family scaling parameters +#' @seealso [process_param] +NULL + +#' @name scale_zscore +#' @title Z Score Scaling +#' @description +#' Wrapper around `base::scale()` to make it compatible with the +#' [processData()] framework. Additionally provides a `MARGIN` param. +#' +#' \deqn{\LARGE +#' z_{i,j} = \frac{x_{i,j} - \mu_i}{\sigma_i} +#' } +#' +#' Where: +#' * \eqn{x_{i,j}} is the original value for feature \eqn{i} in sample \eqn{j} +#' * \eqn{\mu_i} is the mean of feature \eqn{i} across all samples +#' * \eqn{\sigma_i} is the standard deviation of feature \eqn{i} across all +#' samples +#' * \eqn{z_{i,j}} is the resulting scaled value +#' +#' @section params: +#' +#' \tabular{ll}{ +#' `scale` \tab logical (default = `TRUE`) Whether to scale values \cr +#' `center` \tab logical (default = `TRUE`) Whether to center values\cr +#' `MARGIN` \tab numeric. Either 1 (rows) or 2 (cols). Direction along which +#' to perform the operation. +#' } +#' @md +#' @family scaling parameters +#' @seealso [process_param] +NULL + + + + + +# VIRTUAL classes #### +setClass("normParam", contains = c("VIRTUAL", "processParam")) +setClass("scaleParam", contains = c("VIRTUAL", "processParam")) +setClass("adjustParam", contains = c("VIRTUAL", "processParam")) + +# access #### +.DollarNames.scaleParam <- function(x, pattern) { + names(x@param) +} +.DollarNames.normParam <- function(x, pattern) { + names(x@param) +} +.DollarNames.adjustParam <- function(x, pattern) { + names(x@param) +} + +# extending method classes #### +setClass("defaultNormParam", contains = "normParam") +setClass("libraryNormParam", contains = "normParam") +setClass("logNormParam", contains = "normParam") +setClass("osmFISHNormParam", contains = "normParam") +setClass("pearsonResidNormParam", contains = "normParam") +setClass("quantileNormParam", contains = "normParam") +setClass("tfidfNormParam", contains = "normParam") +setClass("l2NormParam", contains = "normParam") + +setClass("defaultScaleParam", contains = "scaleParam") +setClass("zscoreScaleParam", contains = "scaleParam") + +setClass("limmaAdjustParam", contains = "adjustParam") + +# allMatrix signature #### +setClassUnion("allMatrix", members = c("matrix", "Matrix")) + + +# params setup #### +.norm_param_lib <- function(...) 
{
    p <- new("libraryNormParam", param = list(...))
    p$scalefactor <- p$scalefactor %null% 6e3
    p
}
.norm_param_log <- function(...) {
    p <- new("logNormParam", param = list(...))
    p$base <- p$base %null% 2
    p$offset <- p$offset %null% 1
    p
}
.norm_param_osmfish <- function(...) {
    new("osmFISHNormParam", param = list(...))
}
.norm_param_pears_resid <- function(...) {
    p <- new("pearsonResidNormParam", param = list(...))
    p$theta <- p$theta %null% 100
    p
}
.norm_param_quantile <- function(...) {
    new("quantileNormParam", param = list(...))
}
.norm_param_default <- function(...) {
    p <- new("defaultNormParam", param = list(...))
    p$library_size_norm <- p$library_size_norm %null% TRUE
    p$scalefactor <- p$scalefactor %null% 6e3
    p$log_norm <- p$log_norm %null% TRUE
    p$log_offset <- p$log_offset %null% 1
    p$logbase <- p$logbase %null% 2
    p
}
.norm_param_tfidf <- function(...) {
    new("tfidfNormParam", param = list(...))
}
.norm_param_l2 <- function(...) {
    new("l2NormParam", param = list(...))
}

.scale_param_zscore <- function(...) {
    p <- new("zscoreScaleParam", param = list(...))
    p$scale <- p$scale %null% TRUE
    p$center <- p$center %null% TRUE
    p$MARGIN <- p$MARGIN %null% 2
    p
}
.scale_param_default <- function(...) {
    p <- new("defaultScaleParam", param = list(...))
    p$scale_feats <- p$scale_feats %null% TRUE
    p$scale_cells <- p$scale_cells %null% TRUE
    p$scale_order <- p$scale_order %null% c("first_feats", "first_cells")
    p$verbose <- p$verbose %null% TRUE
    p
}


.adjust_param_limma <- function(...) {
    p <- new("limmaAdjustParam", param = list(...))
    # register optional params as NULL defaults without clobbering any
    # user-provided values
    if (is.null(p@param$batch_columns)) {
        p@param <- c(p@param, list(batch_columns = NULL))
    }
    if (is.null(p@param$covariate_columns)) {
        p@param <- c(p@param, list(covariate_columns = NULL))
    }
    p
}

# param factories ####

#' @rdname process_param
#' @export
normParam <- function(method = "default", ...) {
    method <- match.arg(tolower(method),
        c("default", "library", "log", "osmfish", "pearson", "quantile",
            "tf-idf", "l2")
    )
    switch(method,
        "default" = .norm_param_default(...),
        "library" = .norm_param_lib(...),
        "log" = .norm_param_log(...),
        "osmfish" = .norm_param_osmfish(...),
        "pearson" = .norm_param_pears_resid(...),
        "quantile" = .norm_param_quantile(...),
        "tf-idf" = .norm_param_tfidf(...),
        "l2" = .norm_param_l2(...)
    )
}

#' @rdname process_param
#' @export
scaleParam <- function(method = "default", ...) {
    method <- match.arg(tolower(method),
        c("default", "zscore")
    )
    switch(method,
        "default" = .scale_param_default(...),
        "zscore" = .scale_param_zscore(...)
    )
}

#' @rdname process_param
#' @export
adjustParam <- function(method = "limma", ...) {
    method <- match.arg(tolower(method),
        c("limma")
    )
    switch(method,
        "limma" = .adjust_param_limma(...)
    )
}



# methods ####

# * ANY ####

setMethod("processData",
signature(x = "ANY", param = "ANY"), function(x, param) {
    stop(wrap_txtf("param of class '%s' is not recognized for use with '%s'",
        class(param), class(x)),
    call.
= FALSE) +}) + +setMethod("processData", + signature(x = "ANY", param = "adjustParam"), function(x, param) { + " " + }) + +# * exprObj #### + +#' @rdname processData +setMethod("processData", + signature(x = "exprObj", param = "list"), + function(x, param, name = "scaled") { + x[] <- processData(x[], param) + objName(x) <- name + return(x) + } +) + +#' @rdname processData +setMethod("processData", + signature(x = "exprObj", param = "normParam"), + function(x, param, name = "normalized") { + x[] <- processData(x[], param) + objName(x) <- name + return(x) + } +) + +# specialized handling for osmfish +setMethod("processData", + signature(x = "exprObj", param = "osmFISHNormParam"), + function(x, param, name = "custom") { + if (!featType(x) %in% c("rna", "RNA")) { + warning("Caution: osmFISH normalization was developed for RNA in situ data", + call. = FALSE) + } + x[] <- processData(x[], param) + objName(x) <- name + return(x) + } +) + +# specialized handling for pearson residual +setMethod("processData", + signature(x = "exprObj", param = "pearsonResidNormParam"), + function(x, param, name = "scaled") { + if (!featType(x) %in% c("rna", "RNA")) { + warning("Caution: pearson residual normalization was developed for RNA count normalization", + call. = FALSE) + } + x[] <- processData(x[], param) + objName(x) <- name + return(x) + } +) + +#' @rdname processData +setMethod("processData", + signature(x = "exprObj", param = "scaleParam"), + function(x, param, name = "scaled") { + x[] <- processData(x[], param) + objName(x) <- name + return(x) + } +) + + +# * matrix #### + +# ** param list #### + +#' @rdname processData +setMethod("processData", + signature(x = "allMatrix", param = "list"), + function(x, param) { + for (p in param) { + x <- processData(x, p) + } + return(x) + } +) + +# ** norm ------------------ #### +# *** library norm #### +setMethod("processData", + signature(x = "allMatrix", param = "libraryNormParam"), + function(x, param) { + .lib_norm_giotto(mymatrix = x, scalefactor = param$scalefactor) + } +) +# *** log norm #### +setMethod("processData", + signature(x = "allMatrix", param = "logNormParam"), + function(x, param) { + log(x + param$offset) / log(param$base) + } +) +setMethod("processData", + signature(x = "Matrix", param = "logNormParam"), + function(x, param) { + x@x <- log(x@x + param$offset) / log(param$base) + x + } +) +# *** osmFISH norm #### +setMethod("processData", + signature(x = "allMatrix", param = "osmFISHNormParam"), + function(x, param) { + # 1. normalize raw expr per gene with scale-factor equal to number of genes + norm_feats <- (x / rowSums_flex(x)) * nrow(x) + # 2. 
normalize per cells with scale-factor equal to number of cells
        t_flex((t_flex(norm_feats) / colSums_flex(norm_feats)) * ncol(x))
    }
)
# *** pearson norm ####
setMethod("processData",
    signature(x = "allMatrix", param = "pearsonResidNormParam"),
    function(x, param) {
        .pears_resid_citation(verbose = param$verbose)
        .csums <- .csum_nodrop.Matrix
        .rsums <- .rsum_nodrop.Matrix
        .prnorm(
            x = x,
            theta = param$theta,
            .csums = .csums,
            .rsums = .rsums
        )
    }
)
# *** quantile norm ####
setMethod("processData",
    signature(x = "allMatrix", param = "quantileNormParam"),
    function(x, param) {
        .qnorm(x)
    }
)
# *** tf-idf norm ####
setMethod("processData",
    signature(x = "allMatrix", param = "tfidfNormParam"),
    function(x, param) {
        # compute term frequency (TF) per sample (column)
        tf <- t_flex(t_flex(x) / colSums_flex(x))
        # compute inverse document frequency (IDF)
        idf <- log(1 + ncol(x) / (1 + rowSums_flex(x > 0)))
        # apply TF-IDF
        tf * idf
    }
)
# *** default norm ####
setMethod("processData",
    signature(x = "allMatrix", param = "defaultNormParam"),
    function(x, param) {
        plist <- list()
        # 1. library size normalization
        if (isTRUE(param$library_size_norm)) {
            plist <- c(plist, normParam("library",
                scalefactor = param$scalefactor))
        }
        # 2. log normalize
        if (isTRUE(param$log_norm)) {
            plist <- c(plist, normParam("log",
                base = param$logbase,
                offset = param$log_offset)
            )
        }
        processData(x, plist)
    }
)
# *** L2 norm ####
setMethod("processData",
    signature(x = "allMatrix", param = "l2NormParam"),
    function(x, param) {
        .l2_norm(x)
    }
)

# ** scale ----------------- ####
# *** zscore scale ####
setMethod("processData",
    signature("allMatrix", param = "zscoreScaleParam"),
    function(x, param, ...) {
        if (!param$MARGIN %in% c(1, 2)) {
            stop("processData zscore: 'MARGIN' must be either 1 (rows) or 2 (cols)",
                call. = FALSE)
        }
        if (param$MARGIN == 1) x <- t_flex(x)
        x <- standardise_flex(x, center = param$center, scale = param$scale)
        if (param$MARGIN == 1) x <- t_flex(x)
        return(x)
    })
# *** default scale ####
setMethod("processData",
    signature(x = "allMatrix", param = "defaultScaleParam"),
    function(x, param, ...) {
        plist <- list()
        s1 <- scaleParam("zscore", center = TRUE, scale = TRUE, MARGIN = 1)
        s2 <- scaleParam("zscore", center = TRUE, scale = TRUE, MARGIN = 2)
        if (isTRUE(param$scale_feats) && isTRUE(param$scale_cells)) {
            scale_order <- match.arg(param$scale_order,
                choices = c("first_feats", "first_cells")
            )
            if (scale_order == "first_feats") {
                vmsg(.v = param$verbose, "first scale feats and then cells")
                plist <- c(plist, s1, s2)
            } else if (scale_order == "first_cells") {
                vmsg(.v = param$verbose, "first scale cells and then feats")
                plist <- c(plist, s2, s1)
            } else {
                stop("processData defaultScaleParam: scale order must be given",
                    call.
= FALSE) + } + } else if (isTRUE(param$scale_feats)) { + plist <- c(plist, s1) + } else if (isTRUE(param$scale_cells)) { + plist <- c(plist, s2) + } + processData(x, plist) + } +) + + + + + +processExpression <- function(gobject, param, name, + expression_values = "raw", + spat_unit = NULL, + feat_type = NULL, + return_gobject = TRUE) { + ex <- getExpression(gobject, + values = expression_values, + spat_unit = spat_unit, + feat_type = feat_type, + output = "exprObj", + set_defaults = TRUE + ) + res <- processData(ex, param, name = name) + if(!isTRUE(return_gobject)) return(res) + setGiotto(gobject, res) +} + + + + + #' @title normalizeGiotto #' @name normalizeGiotto #' @description fast normalize and/or scale expression values of Giotto object @@ -186,8 +993,35 @@ normalizeGiotto <- function(gobject, + + + + + + + + + + + + # internals #### +.l2_norm <- function(x) { + # Calculate column norms (Euclidean length of each column) + col_norms <- sqrt(colSums_flex(x^2)) + # Avoid division by zero + col_norms[col_norms == 0] <- 1 + # Normalize each column + t_flex(t_flex(x) / col_norms) +} + +.pears_resid_citation <- function(verbose = NULL) { + vmsg(.v = verbose, "using 'Lause/Kobak' method to normalize count matrix. + If used in published research, please cite: + Jan Lause, Philipp Berens, Dmitry Kobak (2020). + 'Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data'") +} #' @title Normalize expression matrix for library size #' @param mymatrix matrix object @@ -200,10 +1034,11 @@ normalizeGiotto <- function(gobject, if (0 %in% libsizes) { warning(wrap_txt("Total library size or counts for individual spat - units are 0. - This will likely result in normalization problems. - filter (filterGiotto) or impute (imputeGiotto) spatial - units.")) + units are 0. + This will likely result in normalization problems. 
+            filter (filterGiotto) or impute (imputeGiotto) spatial
+            units.")
+        )
     }
 
     norm_expr <- t_flex(t_flex(mymatrix) / libsizes) * scalefactor
 
diff --git a/R/package_imports.R b/R/package_imports.R
index c9f96dd1e..912486b99 100644
--- a/R/package_imports.R
+++ b/R/package_imports.R
@@ -11,4 +11,5 @@
 #' @importFrom data.table frank
 #' @importFrom data.table fread
 #' @importFrom data.table merge.data.table
+#' @importClassesFrom Matrix Matrix
 NULL
diff --git a/R/zzz.R b/R/zzz.R
index 3c024dc1c..69af3ffdf 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -43,3 +43,36 @@
     # ----------- #
     init_option("giotto.verbose", TRUE)
 }
+
+.onLoad <- function(libname, pkgname) {
+    # extensible classunions --------------------------------------------#
+    all_matrix <- c("matrix", "Matrix")
+    update_matrix_sig <- FALSE
+    if (requireNamespace("DelayedArray", quietly = TRUE)) {
+        getClass("DelayedArray")
+        all_matrix <- c(all_matrix, "DelayedArray")
+        update_matrix_sig <- TRUE
+    }
+    if (requireNamespace("dbMatrix", quietly = TRUE)) {
+        getClass("dbMatrix")
+        all_matrix <- c(all_matrix, "dbMatrix")
+        update_matrix_sig <- TRUE
+    }
+
+    if (isTRUE(update_matrix_sig)) {
+        setClassUnion("allMatrix", members = all_matrix)
+    }
+    # methods extensions ------------------------------------------------#
+
+    if (requireNamespace("dbMatrix", quietly = TRUE)) {
+        setMethod("processData",
+            signature(x = "dbMatrix", param = "logNormParam"),
+            function(x, param) {
+                # workaround for lack of an @x slot: apply the offset and
+                # the log transform through dplyr on the backing table
+                x[] <- dplyr::mutate(x[], x = x + param$offset)
+                x[] <- dplyr::mutate(x[], x = log(x) / log(param$base))
+                x
+            }
+        )
+    }
+}
diff --git a/man/norm_default.Rd b/man/norm_default.Rd
new file mode 100644
index 000000000..79516df50
--- /dev/null
+++ b/man/norm_default.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{norm_default}
+\alias{norm_default}
+\title{Default Giotto Normalization}
+\description{
+Expression matrix normalization method.
+
+Steps:
+\enumerate{
+\item \link[=norm_library]{Total library size} normalization and scaling by
+a custom scale-factor.
+\item \link[=norm_log]{Log} transformation of data.
+}
+}
+\section{params}{
+
+
+\tabular{ll}{
+\code{library_size_norm} \tab logical (default = \code{TRUE}). Whether to perform
+library size normalization \cr
+\code{scalefactor} \tab numeric (default = 6000). Scalefactor to use after
+library size normalization. (skipped if \code{library_size_norm = FALSE}) \cr
+\code{log_norm} \tab logical (default = \code{TRUE}). Whether to transform values to
+log-scale. \cr
+\code{log_offset} \tab numeric (default = 1). If \code{log_norm = TRUE}, offset
+value to add to expression values to avoid \code{log(0)} \cr
+\code{logbase} \tab numeric (default = 2).
If \code{log_norm = TRUE}, log base to +use to log normalize expression values +} +} + +\seealso{ +\link{process_param} + +Other normalization parameters: +\code{\link{norm_l2}}, +\code{\link{norm_library}}, +\code{\link{norm_log}}, +\code{\link{norm_osmfish}}, +\code{\link{norm_pearson}}, +\code{\link{norm_quantile}}, +\code{\link{norm_tfidf}} +} +\concept{normalization parameters} diff --git a/man/norm_l2.Rd b/man/norm_l2.Rd new file mode 100644 index 000000000..43e7e28af --- /dev/null +++ b/man/norm_l2.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{norm_l2} +\alias{norm_l2} +\title{L2 Normalization} +\description{ +L2 normalization (also known as Euclidean normalization) scales each column +(sample) in the expression matrix to have unit Euclidean length. This +process makes samples with different sequencing depths more comparable and +improves the performance of distance-based analyses. + +\deqn{\LARGE +x'_{i,j} = \frac{x_{i,j}}{\sqrt{\sum_{i} x_{i,j}^2}} +} + +Where: +\itemize{ +\item (\eqn{x_{i,j}}) is the expression value for feature \eqn{i} in sample \eqn{j} +\item (\eqn{x'_{i,j}}) is the L2-normalized expression value +} +} +\section{Note}{ + +L2 normalization can be applied to raw data, but is most commonly used after +other normalization methods such as TF-IDF or log normalization to standardize +sample-to-sample comparisons. +} + +\section{params}{ + +None +} + +\seealso{ +\link{process_param} + +Other normalization parameters: +\code{\link{norm_default}}, +\code{\link{norm_library}}, +\code{\link{norm_log}}, +\code{\link{norm_osmfish}}, +\code{\link{norm_pearson}}, +\code{\link{norm_quantile}}, +\code{\link{norm_tfidf}} +} +\concept{normalization parameters} diff --git a/man/norm_library.Rd b/man/norm_library.Rd new file mode 100644 index 000000000..47d3f0a8b --- /dev/null +++ b/man/norm_library.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{norm_library} +\alias{norm_library} +\title{Library Size Normalization} +\description{ +Normalize expression matrix for total library size and then scale by +a custom scalefactor. + +This method does not work well when any cells/samples +have a library size of 0, so filtering prior to this is recommended. + +\deqn{\LARGE +x'_{i,j} = \frac{x_{i,j}}{\sum_{i} x_{i,j}} \times k +} +Where: +\itemize{ +\item (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j} +\item (\eqn{x'_{i,j}}) is the library normalized and scaled expression value for +feature \eqn{i} in sample \eqn{j} +\item (k) is a scalefactor applied after normalization +} +} +\section{params}{ + + +\tabular{ll}{ +\code{scalefactor} \tab numeric (default = 6000). Scalefactor to use after +library size normalization. 
Expressed as \emph{\strong{k}} in the above equation +} +} + +\seealso{ +\link{process_param} + +Other normalization parameters: +\code{\link{norm_default}}, +\code{\link{norm_l2}}, +\code{\link{norm_log}}, +\code{\link{norm_osmfish}}, +\code{\link{norm_pearson}}, +\code{\link{norm_quantile}}, +\code{\link{norm_tfidf}} +} +\concept{normalization parameters} diff --git a/man/norm_log.Rd b/man/norm_log.Rd new file mode 100644 index 000000000..e4b436fb5 --- /dev/null +++ b/man/norm_log.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{norm_log} +\alias{norm_log} +\title{Log Normalization} +\description{ +Apply a log normalization + +\deqn{\LARGE +x'_{i,j} = \frac{\log(x_{i,j} + b)}{\log(a)} +} +Where: +\itemize{ +\item (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j} +\item (\eqn{x'_{i,j}}) is the log normalized expression value for feature +\eqn{i} in sample \eqn{j} +\item (\eqn{a}) is the log base +\item (\eqn{b}) is an offset value +} +} +\section{params}{ + + +\tabular{ll}{ +\code{base} \tab numeric (default = 2) log base to use. Expressed as \eqn{a} in +the above equation. \cr +\code{offset} \tab numeric (default = 1). Offset to add to expression values to +avoid \eqn{\log(0)}. Expressed as \eqn{b} in the above equation. +} +} + +\seealso{ +\link{process_param} + +Other normalization parameters: +\code{\link{norm_default}}, +\code{\link{norm_l2}}, +\code{\link{norm_library}}, +\code{\link{norm_osmfish}}, +\code{\link{norm_pearson}}, +\code{\link{norm_quantile}}, +\code{\link{norm_tfidf}} +} +\concept{normalization parameters} diff --git a/man/norm_osmfish.Rd b/man/norm_osmfish.Rd new file mode 100644 index 000000000..c5abb3989 --- /dev/null +++ b/man/norm_osmfish.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{norm_osmfish} +\alias{norm_osmfish} +\title{osmFISH Normalization} +\description{ +Normalization method as provided by the osmFISH paper + +Steps: +\enumerate{ +\item First normalize genes, for each gene divide the counts by the total gene +count and multiply by the total number of genes. +\item Next normalize cells, for each cell divide the normalized gene counts by +the total counts per cell and multiply by the total number of cells. 
+}
+
+\deqn{\LARGE
+x'_{i,j} = \frac{x_{i,j}}{\sum_j x_{i,j}} \times n_{\text{features}}
+}
+
+\deqn{\LARGE
+x''_{i,j} = \frac{x'_{i,j}}{\sum_i x'_{i,j}} \times n_{\text{samples}}
+}
+
+Where:
+\itemize{
+\item (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+\item (\eqn{x'_{i,j}}) is the feature normalized expression value
+\item (\eqn{x''_{i,j}}) is the final normalized expression value after both
+feature and cell normalization
+\item (\eqn{n_{\text{samples}}}) is the total number of cells
+(columns in matrix)
+\item (\eqn{n_{\text{features}}}) is the total number of features
+(rows in matrix)
+}
+}
+\section{params}{
+
+None
+}
+
+\seealso{
+\link{process_param}
+
+Other normalization parameters:
+\code{\link{norm_default}},
+\code{\link{norm_l2}},
+\code{\link{norm_library}},
+\code{\link{norm_log}},
+\code{\link{norm_pearson}},
+\code{\link{norm_quantile}},
+\code{\link{norm_tfidf}}
+}
+\concept{normalization parameters}
diff --git a/man/norm_pearson.Rd b/man/norm_pearson.Rd
new file mode 100644
index 000000000..204d2cf60
--- /dev/null
+++ b/man/norm_pearson.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{norm_pearson}
+\alias{norm_pearson}
+\title{Lause/Kobak Pearson Residuals Normalization}
+\description{
+Calculate Pearson residuals with a dispersion adjustment, to identify cells
+that deviate significantly from what would be expected under independence.
+The normalization divides the difference between observed and expected
+counts by its standard deviation, which is adjusted by the dispersion
+parameter θ.
+
+This normalization is designed for detection of highly variable features,
+dimension reduction, and clustering.
+
+\deqn{\LARGE
+z_{i,j} = \frac{x_{i,j} - \mu_{i,j}}{\sqrt{\mu_{i,j} + \mu_{i,j}^2 / \theta}}
+}
+
+\deqn{\LARGE
+\mu_{i,j} = \frac{r_i \cdot c_j}{N}
+}
+
+Where:
+\itemize{
+\item (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j}
+\item (\eqn{\mu_{i,j}}) is the expected value under the model
+\item (\eqn{r_i}) is \eqn{\sum_j x_{i,j}}
+\item (\eqn{c_j}) is \eqn{\sum_i x_{i,j}}
+\item (\eqn{N}) is \eqn{\sum_{i,j} x_{i,j}}
+\item (\eqn{\theta}) is a dispersion parameter
+\item (\eqn{z_{i,j}}) is the Pearson residual clipped to the range
+\eqn{[-\sqrt{n}, \sqrt{n}]} where \eqn{n} is the number of columns. This is
+done to prevent extreme values from dominating the analysis.
+}
+}
+\section{Note}{
+Scaling is not recommended after this normalization since it is already
+transforming the data to z-score-like values with a dispersion adjustment.
+It is also not recommended to use this with DGE analysis.
+}
+
+\section{params}{
+
+
+\tabular{ll}{
+\code{theta} \tab numeric (default = 100). Dispersion parameter, expressed as
+\eqn{\theta} in the above formula
+}
+}
+
+\references{
+Lause, J., Berens, P. & Kobak, D. Analytic Pearson residuals for
+normalization of single-cell RNA-seq UMI data. Genome Biol 22, 258 (2021).
+https://doi.org/10.1186/s13059-021-02451-7
+}
+\seealso{
+\link{process_param}
+
+Other normalization parameters:
+\code{\link{norm_default}},
+\code{\link{norm_l2}},
+\code{\link{norm_library}},
+\code{\link{norm_log}},
+\code{\link{norm_osmfish}},
+\code{\link{norm_quantile}},
+\code{\link{norm_tfidf}}
+}
+\concept{normalization parameters}
diff --git a/man/norm_quantile.Rd b/man/norm_quantile.Rd
new file mode 100644
index 000000000..0b07320e6
--- /dev/null
+++ b/man/norm_quantile.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{norm_quantile}
+\alias{norm_quantile}
+\title{Quantile Normalization}
+\description{
+Quantile normalization makes the statistical distribution of values in each
+column identical by replacing the original values with the mean of the
+values at the same rank across all columns. This removes technical variation
+while preserving relative differences between features.
+
+Steps:
+\enumerate{
+\item Rank the values within each column (average taken in case of ties)
+\item Calculate the mean of values at the same rank across all columns
+\item Replace each value with the mean value corresponding to its rank
+}
+
+\deqn{\LARGE
+q_{i,j} = \bar{x}_{rank(i,j)}
+}
+
+Where:
+\itemize{
+\item (\eqn{rank(i,j)}) is the rank of feature \eqn{i} within column \eqn{j}
+\item (\eqn{\bar{x}_{r}}) where \eqn{r = rank(i,j)} is the mean of values with
+rank \eqn{r} across all columns
+\item (\eqn{q_{i,j}}) is the quantile-normalized value
+}
+}
+\section{Note}{
+Library normalization and log normalization are recommended prior to this
+normalization.
+}
+
+\section{params}{
+
+None
+}
+
+\references{
+Bolstad, B.M., Irizarry, R.A., Astrand, M. et al. A comparison of
+normalization methods for high density oligonucleotide array data based on
+variance and bias. Bioinformatics 19, 185–193 (2003).
+https://doi.org/10.1093/bioinformatics/19.2.185
+}
+\seealso{
+\link{process_param}
+
+Other normalization parameters:
+\code{\link{norm_default}},
+\code{\link{norm_l2}},
+\code{\link{norm_library}},
+\code{\link{norm_log}},
+\code{\link{norm_osmfish}},
+\code{\link{norm_pearson}},
+\code{\link{norm_tfidf}}
+}
+\concept{normalization parameters}
diff --git a/man/norm_tfidf.Rd b/man/norm_tfidf.Rd
new file mode 100644
index 000000000..2122b4923
--- /dev/null
+++ b/man/norm_tfidf.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{norm_tfidf}
+\alias{norm_tfidf}
+\title{TF-IDF Normalization}
+\description{
+TF-IDF (Term Frequency-Inverse Document Frequency) normalization is borrowed
+from natural language processing to identify features that are highly expressed
+in specific samples but not widely expressed across the entire dataset.
+ +\deqn{\LARGE +TF_{i,j} = \frac{x_{i,j}}{\sum_{i} x_{i,j}} +} + +\deqn{\LARGE +IDF_{i} = \log(1 + \frac{n_{samples}}{1 + n_{samples \: where \: feature \: i > 0}}) +} + +\deqn{\LARGE +TFIDF_{i,j} = TF_{i,j} \times IDF_{i} +} + +Where: +\itemize{ +\item (\eqn{x_{i,j}}) is the raw count for feature \eqn{i} in sample \eqn{j} +\item (\eqn{TF_{i,j}}) is the term frequency of feature \eqn{i} in sample \eqn{j} +\item (\eqn{IDF_{i}}) is the inverse document frequency of feature \eqn{i} +\item (\eqn{TFIDF_{i,j}}) is the final TF-IDF normalized value +} +} +\section{Note}{ +\link[=norm_l2]{L2} normalization is commonly performed after TF-IDF normalization +} + +\section{params}{ + +None +} + +\seealso{ +\link{process_param} + +Other normalization parameters: +\code{\link{norm_default}}, +\code{\link{norm_l2}}, +\code{\link{norm_library}}, +\code{\link{norm_log}}, +\code{\link{norm_osmfish}}, +\code{\link{norm_pearson}}, +\code{\link{norm_quantile}} +} +\concept{normalization parameters} diff --git a/man/processData.Rd b/man/processData.Rd new file mode 100644 index 000000000..aee07aaeb --- /dev/null +++ b/man/processData.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{processData} +\alias{processData} +\alias{processData,exprObj,list-method} +\alias{processData,exprObj,normParam-method} +\alias{processData,exprObj,scaleParam-method} +\alias{processData,allMatrix,list-method} +\title{Composable Data Processing} +\usage{ +\S4method{processData}{exprObj,list}(x, param, name = "scaled") + +\S4method{processData}{exprObj,normParam}(x, param, name = "normalized") + +\S4method{processData}{exprObj,scaleParam}(x, param, name = "scaled") + +\S4method{processData}{allMatrix,list}(x, param) +} +\arguments{ +\item{x}{data to transform} + +\item{param}{S4 parameter class defining the transform operation and +params affecting it.} + +\item{name}{character. \link[GiottoClass:giotto_schema]{Object name} to assign +to the output.} + +\item{\dots}{additional params to pass} +} +\description{ +Perform data transformations, or set up chains of transformations and +operations to be applied to matrix type data. \code{processData()} is a generic +for which methods can be defined off both \code{x} (the data to transform), +and \code{param} (the transform operation). +} +\examples{ +m <- matrix(c(0, 0, 3, 2, 0, 5, 4, 0, 0, 1, 12, 0), nrow = 3) + +# single operation +lib_norm <- normParam("library") +lib_norm$scalefactor <- 5000 # alter a default param of library norm +processData(m, lib_norm) + +# chained operations +log_norm <- normParam("log") +zscore_cols <- scaleParam("zscore") +zscore_rows <- scaleParam("zscore", MARGIN = 1) +# this is essentially the same as the default giotto normalization +# only difference is the library norm scalefactor change. +processData(m, list(lib_norm, log_norm, zscore_cols, zscore_rows)) +} +\seealso{ +\link{process_param} for processing operations that can be performed +through \code{processData()} +} diff --git a/man/process_param.Rd b/man/process_param.Rd new file mode 100644 index 000000000..cdeaf46fb --- /dev/null +++ b/man/process_param.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalize.R +\name{process_param} +\alias{process_param} +\alias{normParam} +\alias{scaleParam} +\alias{adjustParam} +\title{Data Processing Parameter Class Factories} +\usage{ +normParam(method = "default", ...) + +scaleParam(method = "default", ...) 
+
+adjustParam(method = "limma", ...)
+}
+\arguments{
+\item{method}{character. Name of method to use. See details.}
+
+\item{\dots}{(optional) Additional named parameters relevant to the param
+class.}
+}
+\description{
+Data processing operations in Giotto Suite can be divided into
+normalization, scaling, and adjustments
+}
+\section{normParam methods}{
+
+\itemize{
+\item \code{\link[=norm_default]{"default"}} - default Giotto normalization steps
+(library + log norms)
+\item \code{\link[=norm_library]{"library"}} - library normalization
+\item \code{\link[=norm_log]{"log"}} - log normalization
+\item \code{\link[=norm_osmfish]{"osmfish"}} - osmfish normalization method
+\item \code{\link[=norm_pearson]{"pearson"}} - Lause/Kobak 2020 pearson residuals
+normalization
+\item \code{\link[=norm_quantile]{"quantile"}} - quantile normalization
+\item \code{\link[=norm_tfidf]{"tf-idf"}} - Term Frequency-Inverse Document Frequency
+\item \code{\link[=norm_l2]{"l2"}} - L2 normalization (also known as Euclidean
+normalization)
+}
+}

+\section{scaleParam methods}{
+
+\itemize{
+\item \code{\link[=scale_default]{"default"}} - default Giotto scaling steps (scale along
+features then cells)
+\item \code{\link[=scale_zscore]{"zscore"}} - essentially the same as \code{base::scale()}, but
+with a \code{MARGIN} param allowing scaling along either cols or rows
+}
+}

+\section{adjustParam methods}{
+
+\itemize{
+\item \code{"limma"} - limma batch correction
+}
+}

diff --git a/man/scale_default.Rd b/man/scale_default.Rd
new file mode 100644
index 000000000..e262af4ce
--- /dev/null
+++ b/man/scale_default.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{scale_default}
+\alias{scale_default}
+\title{Default Giotto Scaling}
+\description{
+2 step \link[=scale_zscore]{z-scoring} along features and samples
+}
+\section{params}{
+
+
+\tabular{ll}{
+\code{scale_feats} \tab logical (default = \code{TRUE}) Whether to scale across
+features \cr
+\code{scale_cells} \tab logical (default = \code{TRUE}) Whether to scale across
+cells/samples \cr
+\code{scale_order} \tab character. One of either \code{"first_feats"} or
+\code{"first_cells"}. When both \code{scale_feats} and \code{scale_cells} are \code{TRUE},
+determines the order in which the 2 scaling operations are performed. \cr
+\code{verbose} \tab logical (default = \code{TRUE}) Whether to be verbose
+}
+}

+\seealso{
+\link{process_param}
+
+Other scaling parameters:
+\code{\link{scale_zscore}}
+}
+\concept{scaling parameters}
diff --git a/man/scale_zscore.Rd b/man/scale_zscore.Rd
new file mode 100644
index 000000000..08c740deb
--- /dev/null
+++ b/man/scale_zscore.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{scale_zscore}
+\alias{scale_zscore}
+\title{Z Score Scaling}
+\description{
+Wrapper around \code{base::scale()} to make it compatible with the
+\code{\link[=processData]{processData()}} framework. Additionally provides a \code{MARGIN} param.
+ +\deqn{\LARGE +z_{i,j} = \frac{x_{i,j} - \mu_i}{\sigma_i} +} + +Where: +\itemize{ +\item \eqn{x_{i,j}} is the original value for feature \eqn{i} in sample \eqn{j} +\item \eqn{\mu_i} is the mean of feature \eqn{i} across all samples +\item \eqn{\sigma_i} is the standard deviation of feature \eqn{i} across all +samples +\item \eqn{z_{i,j}} is the resulting scaled value +} +} +\section{params}{ + + +\tabular{ll}{ +\code{scale} \tab logical (default = \code{TRUE}) Whether to scale values \cr +\code{center} \tab logical (default = \code{TRUE}) Whether to center values\cr +\code{MARGIN} \tab numeric. Either 1 (rows) or 2 (cols). Direction along which +to perform the operation. +} +} + +\seealso{ +\link{process_param} + +Other scaling parameters: +\code{\link{scale_default}} +} +\concept{scaling parameters} From be6c343101946305a3dd52be4502d2017828dd91 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Fri, 28 Feb 2025 17:32:35 -0500 Subject: [PATCH 2/2] feat: limma via `processData()` and docs --- NAMESPACE | 4 + NEWS.md | 4 + R/normalize.R | 377 +++++++++++++++++++++++++++++---------- man/adjust_limma.Rd | 32 ++++ man/processData.Rd | 20 ++- man/processExpression.Rd | 75 ++++++++ man/process_param.Rd | 8 +- 7 files changed, 415 insertions(+), 105 deletions(-) create mode 100644 man/adjust_limma.Rd create mode 100644 man/processExpression.Rd diff --git a/NAMESPACE b/NAMESPACE index 508e29ff6..a0c754800 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,9 @@ S3method(.DollarNames,CosmxReader) S3method(.DollarNames,VisiumHDReader) S3method(.DollarNames,XeniumReader) +S3method(.DollarNames,adjustParam) +S3method(.DollarNames,normParam) +S3method(.DollarNames,scaleParam) export("%>%") export("activeFeatType<-") export("activeSpatUnit<-") @@ -339,6 +342,7 @@ export(polyStamp) export(preprocessImageToMatrix) export(print.combIcfObject) export(print.icfObject) +export(processExpression) export(processGiotto) export(prov) export(rankSpatialCorGroups) diff --git a/NEWS.md b/NEWS.md index ba8231671..7e713b6ed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,10 @@ * `importCosMx()` now supports vectors of filepaths when provided to `$load_images()` and `$load_polys()` * `importCosMx()` Selected FOVs are now selected in `plot()`. +## New +* `processExpression()` for `giotto` implemented via the `processData()` framework in {GiottoClass} v0.4.7 (see `?processData` and `?process_param`) +* `L2` and `TF-IDF` normalization methods accessible via the `processData()` framework + # Giotto 4.2.0 (2025/01/17) ## Breaking Changes diff --git a/R/normalize.R b/R/normalize.R index c6085b504..9cca5a20a 100644 --- a/R/normalize.R +++ b/R/normalize.R @@ -1,4 +1,48 @@ # Documentation #### + +#' @name processExpression +#' @title Expression Data Processing +#' @description +#' Perform data transformations, or set up chains of transformations and +#' operations to be applied to expression type data in the `giotto` object. +#' @param gobject `giotto` object +#' @inheritParams processData +#' @param expression_values character. Name of matrix to use +#' @param spat_unit character (optional). spatial unit to use +#' @param feat_type character (optional). feature type to use +#' @param return_gobject logical (optional). Whether to return the `gobject`. +#' When FALSE, the `exprObj` is returned instead. +#' @returns A `giotto` object when `return_gobject = TRUE`. 
Otherwise, an
+#' `exprObj`
+#' @seealso [process_param] for processing operations that can be performed
+#'
+#' [processData()] for the lower level generic handling these operations
+#' @examples
+#' g <- GiottoData::loadGiottoMini("visium")
+#' # single operation
+#' processExpression(g, normParam("library"), name = "library")
+#'
+#' # single operation with changed parameter
+#' lib <- normParam("library")
+#' lib$scalefactor <- 1000
+#' processExpression(g, lib, name = "library2")
+#'
+#' # return the exprObj instead
+#' processExpression(g, lib, name = "library2", return_gobject = FALSE)
+#'
+#' # chained operation (this is the Giotto standard normalization)
+#' processExpression(g,
+#'     list(
+#'         normParam("library"),
+#'         normParam("log"),
+#'         scaleParam("zscore", MARGIN = 2),
+#'         scaleParam("zscore", MARGIN = 1)
+#'     ),
+#'     name = "scaled2"
+#' )
+#' @md
+NULL

 #' @name processData
 #' @title Composable Data Processing
 #' @description
 #' Perform data transformations, or set up chains of transformations and
 #' operations to be applied to matrix type data. `processData()` is a generic
 #' for which methods can be defined off both `x` (the data to transform),
 #' and `param` (the transform operation).
 #' @param x data to transform
 #' @param param S4 parameter class defining the transform operation and
-#' params affecting it.
+#' params affecting it. Can also be a list of several of these objects, acting
+#' as a pipeline.
 #' @param name character. [Object name][GiottoClass::giotto_schema] to assign
 #' to the output.
 #' @param \dots additional params to pass
 #' processData(m, list(lib_norm, log_norm, zscore_cols, zscore_rows))
 #' @seealso [process_param] for processing operations that can be performed
 #' through `processData()`
+#' @seealso [processExpression()] for the way to use this framework with the
+#' `giotto` object
+#' @returns The same class as `x`
 #' @md
 NULL

 #' @section adjustParam methods:
 #'
-#' * `"limma"` - limma batch correction
+#' * [`"limma"`][adjust_limma] - limma batch correction
+#' @seealso [processData()] for the generic used to apply these params
+#' @seealso [processExpression()] for the way to use this framework with the
+#' `giotto` object
 #' @md
 NULL

 #' @seealso [process_param]
 NULL

+#' @name adjust_limma
+#' @title Limma Batch Correction
+#' @description
+#' Batch effect removal via [limma::removeBatchEffect()]
+#'
+#' @section params:
+#'
+#' \tabular{ll}{
+#' `batch_columns` \tab [svkey][GiottoClass::svkey()] (optional) Up to two
+#' columns of information from a Giotto object indicating batches whose
+#' effects should be removed. \cr
+#' `covariate_columns` \tab [svkey][GiottoClass::svkey()] (optional) Columns
+#' of information from a Giotto object indicating covariates
+#' to regress out.
+#' } +#' @examples +#' limma <- adjustParam("limma") +#' limma$covariate_columns <- svkey(feats = c("nr_feats", "total_expr")) +#' +#' g <- GiottoData::loadGiottoMini("visium") +#' processExpression(g, limma, name = "limma") +#' @family adjustment parameters +#' @seealso [process_param] +#' @md +NULL # VIRTUAL classes #### @@ -408,12 +482,15 @@ setClass("scaleParam", contains = c("VIRTUAL", "processParam")) setClass("adjustParam", contains = c("VIRTUAL", "processParam")) # access #### +#' @export .DollarNames.scaleParam <- function(x, pattern) { names(x@param) } +#' @export .DollarNames.normParam <- function(x, pattern) { names(x@param) } +#' @export .DollarNames.adjustParam <- function(x, pattern) { names(x@param) } @@ -437,72 +514,7 @@ setClass("limmaAdjustParam", contains = "adjustParam") setClassUnion("allMatrix", members = c("matrix", "Matrix")) -# params setup #### -.norm_param_lib <- function(...) { - p <- new("libraryNormParam", param = list(...)) - p$scalefactor <- p$scalefactor %null% 6e3 - p -} -.norm_param_log <- function(...) { - p <- new("logNormParam", param = list(...)) - p$base <- p$base %null% 2 - p$offset <- p$offset %null% 1 - p -} -.norm_param_osmfish <- function(...) { - new("osmFISHNormParam", param = list(...)) -} -.norm_param_pears_resid <- function(...) { - p <- new("pearsonResidNormParam", param = list(...)) - p$theta <- p$theta %null% 100 - p -} -.norm_param_quantile <- function(...) { - new("quantileNormParam", param = list(...)) -} -.norm_param_default <- function(...) { - p <- new("defaultNormParam", param = list(...)) - p$library_size_norm <- p$library_size_norm %null% TRUE - p$scalefactor <- p$scalefactor %null% 6e3 - p$log_norm <- p$log_norm %null% TRUE - p$log_offset <- p$log_offset %null% 1 - p$logbase <- p$logbase %null% 2 - p -} -.norm_param_tfidf <- function(...) { - new("tfidfNormParam", param = list(...)) -} -.norm_param_l2 <- function(...) { - new("l2NormParam", param = list(...)) -} - -.scale_param_zscore <- function(...) { - p <- new("zscoreScaleParam", param = list(...)) - p$scale <- p$scale %null% TRUE - p$center <- p$center %null% TRUE - p$MARGIN <- p$MARGIN %null% 2 - p -} -.scale_param_default <- function(...) { - p <- new("defaultScaleParam", param = list(...)) - p$scale_feats <- p$scale_feats %null% TRUE - p$scale_cells <- p$scale_cells %null% TRUE - p$scale_order <- p$scale_order %null% c("first_feats", "first_cells") - p$verbose <- p$verbose %null% TRUE - p -} - -.adjust_param_limma <- function(...) { - p <- new("limmaAdjustParam", param = list(...)) - p@param <- if (is.null(p@param$batch_columns)) { - c(p@param, list(batch_columns = NULL)) - } - p@param <- if (is.null(p@param$covariate_columns)) { - c(p@param, list(covariate_columns = NULL)) - } - p -} # param factories #### @@ -555,24 +567,19 @@ adjustParam <- function(method = "limma", ...) { # * ANY #### setMethod("processData", -signature(x = "ANY", param = "ANY"), function(x, param) { +signature(x = "ANY", param = "ANY"), function(x, param, ...) { stop(wrap_txtf("param of class '%s' is not recognized for use with '%s'", class(param), class(x)), call. = FALSE) }) -setMethod("processData", - signature(x = "ANY", param = "adjustParam"), function(x, param) { - " " - }) - # * exprObj #### #' @rdname processData setMethod("processData", signature(x = "exprObj", param = "list"), - function(x, param, name = "scaled") { - x[] <- processData(x[], param) + function(x, param, name = "scaled", ...) { + x[] <- processData(x[], param, ...) 
objName(x) <- name return(x) } @@ -581,8 +588,18 @@ setMethod("processData", #' @rdname processData setMethod("processData", signature(x = "exprObj", param = "normParam"), - function(x, param, name = "normalized") { - x[] <- processData(x[], param) + function(x, param, name = "normalized", ...) { + x[] <- processData(x[], param, ...) + objName(x) <- name + return(x) + } +) + +#' @rdname processData +setMethod("processData", + signature(x = "exprObj", param = "adjustParam"), + function(x, param, name = "custom", ...) { + x[] <- processData(x[], param, ...) objName(x) <- name return(x) } @@ -591,12 +608,12 @@ setMethod("processData", # specialized handling for osmfish setMethod("processData", signature(x = "exprObj", param = "osmFISHNormParam"), - function(x, param, name = "custom") { + function(x, param, name = "custom", ...) { if (!featType(x) %in% c("rna", "RNA")) { warning("Caution: osmFISH normalization was developed for RNA in situ data", call. = FALSE) } - x[] <- processData(x[], param) + x[] <- processData(x[], param, ...) objName(x) <- name return(x) } @@ -605,12 +622,12 @@ setMethod("processData", # specialized handling for pearson residual setMethod("processData", signature(x = "exprObj", param = "pearsonResidNormParam"), - function(x, param, name = "scaled") { + function(x, param, name = "scaled", ...) { if (!featType(x) %in% c("rna", "RNA")) { warning("Caution: pearson residual normalization was developed for RNA count normalization", call. = FALSE) } - x[] <- processData(x[], param) + x[] <- processData(x[], param, ...) objName(x) <- name return(x) } @@ -619,8 +636,8 @@ setMethod("processData", #' @rdname processData setMethod("processData", signature(x = "exprObj", param = "scaleParam"), - function(x, param, name = "scaled") { - x[] <- processData(x[], param) + function(x, param, name = "scaled", ...) { + x[] <- processData(x[], param, ...) objName(x) <- name return(x) } @@ -634,9 +651,9 @@ setMethod("processData", #' @rdname processData setMethod("processData", signature(x = "allMatrix", param = "list"), - function(x, param) { + function(x, param, ...) { for (p in param) { - x <- processData(x, p) + x <- processData(x, p, ...) } return(x) } @@ -646,20 +663,20 @@ setMethod("processData", # *** library norm #### setMethod("processData", signature(x = "allMatrix", param = "libraryNormParam"), - function(x, param) { + function(x, param, ...) { .lib_norm_giotto(mymatrix = x, scalefactor = param$scalefactor) } ) # *** log norm #### setMethod("processData", signature(x = "allMatrix", param = "logNormParam"), - function(x, param) { + function(x, param, ...) { log(x + param$offset) / log(param$base) } ) setMethod("processData", signature(x = "Matrix", param = "logNormParam"), - function(x, param) { + function(x, param, ...) { x@x <- log(x@x + param$offset) / log(param$base) x } @@ -667,7 +684,7 @@ setMethod("processData", # *** osmFISH norm #### setMethod("processData", signature(x = "allMatrix", param = "osmFISHNormParam"), - function(x, param) { + function(x, param, ...) { # 1. normalize raw expr per gene with scale-factor equal to number of genes norm_feats <- (x / rowSums_flex(x)) * nrow(x) # 2. normalize per cells with scale-factor equal to number of cells @@ -677,7 +694,7 @@ setMethod("processData", # *** pearson norm #### setMethod("processData", signature(x = "allMatrix", param = "pearsonResidNormParam"), - function(x, param) { + function(x, param, ...) 
{
         .pears_resid_citation(verbose = param$verbose)
         .csums <- .csum_nodrop.Matrix
         .rsums <- .rsum_nodrop.Matrix
@@ -692,14 +709,14 @@ setMethod("processData",
 # *** quantile norm ####
 setMethod("processData",
     signature(x = "allMatrix", param = "quantileNormParam"),
-    function(x, param) {
+    function(x, param, ...) {
         .qnorm(x)
     }
 )
 # *** tf-idf norm ####
 setMethod("processData",
     signature(x = "allMatrix", param = "tfidfNormParam"),
-    function(x, param) {
+    function(x, param, ...) {
         # compute term frequency (TF)
         tf <- x / rowSums_flex(x)
         # compute inverse document frequency (IDF)
@@ -711,7 +728,7 @@ setMethod("processData",
 # *** default norm ####
 setMethod("processData",
     signature(x = "allMatrix", param = "defaultNormParam"),
-    function(x, param) {
+    function(x, param, ...) {
         plist <- list()
         # 1. library size normalization
         if (isTRUE(param$library_size_norm)) {
@@ -725,13 +742,13 @@ setMethod("processData",
                 log_offset = param$log_offset)
             )
         }
-        processData(x, plist)
+        processData(x, plist, ...)
     }
 )
 # *** L2 norm ####
 setMethod("processData",
     signature(x = "allMatrix", param = "l2NormParam"),
-    function(x, param) {
+    function(x, param, ...) {
         .l2_norm(x)
     }
 )
@@ -781,14 +798,66 @@ setMethod("processData",
 )
+# ** adjust ####
+# *** limma ####
+setMethod("processData",
+    signature(x = "allMatrix", param = "limmaAdjustParam"),
+    function(x, param, context = NULL, ...) {
+        package_check("limma")
+        if (is.null(context)) {
+            c(
+                "limma adjustment: `context` arg should be a gobject",
+                "containing the columns to use for batch and/or covariate",
+                "information."
+            ) %>%
+                wrap_txt(errWidth = TRUE) %>%
+                stop(call. = FALSE)
+        }
+        batches <- param$batch_columns
+        covariates <- param$covariate_columns
+        if (is.null(batches) && is.null(covariates)) {
+            "limma adjustment: At least one of `batch_columns` or
+            `covariate_columns` must be provided." %>%
+                wrap_txt() %>%
+                stop(call. = FALSE)
+        }
+        sample_order <- colnames(x)
+        limma_args <- list(x = x, ...)
+        # batches
+        if (!is.null(batches)) {
+            b_dt <- .get_svkey(batches, context, sample_order = sample_order)
+            if (ncol(b_dt) > 2) {
+                "a max of 2 columns is allowed for 'batch_columns'" %>%
+                    stop(call. = FALSE)
+            } else {
+                limma_args$batch <- b_dt[[1]]
+                if (ncol(b_dt) == 2) {
+                    limma_args$batch2 <- b_dt[[2]]
+                }
+            }
+        }
+        # covariates
+        if (!is.null(covariates)) {
+            c_dt <- .get_svkey(covariates, context,
+                sample_order = sample_order)
+            limma_args$covariates <- as.matrix(c_dt)
+        }
+        do.call(limma::removeBatchEffect, args = limma_args) %>%
+            as("Matrix")
+    })
+
+
+#' @rdname processExpression
+#' @export
 processExpression <- function(gobject, param, name,
     expression_values = "raw",
     spat_unit = NULL,
     feat_type = NULL,
-    return_gobject = TRUE) {
+    return_gobject = TRUE,
+    ...) {
     ex <- getExpression(gobject,
         values = expression_values,
         spat_unit = spat_unit,
@@ -796,7 +865,23 @@ processExpression <- function(gobject, param, name,
         output = "exprObj",
         set_defaults = TRUE
     )
-    res <- processData(ex, param, name = name)
+    process_args <- list(
+        x = ex,
+        param = param,
+        name = name,
+        ...
+    )
+
+    # detect svkeys
+    if (!is.list(param)) param <- list(param)
+    param_dump <- lapply(param, function(p) {
+        p@param
+    })
+    has_svk <- .check_svkey(unlist(param_dump), type = "any")
+
+    if (has_svk) process_args$context <- gobject
+
+    res <- do.call(processData, args = process_args)
     if(!isTRUE(return_gobject)) return(res)
     setGiotto(gobject, res)
 }
@@ -1007,6 +1092,100 @@ normalizeGiotto <- function(gobject,
 # internals ####
+# * params setup ####
+.norm_param_lib <- function(...) {
+    p <- new("libraryNormParam", param = list(...))
+    p$scalefactor <- p$scalefactor %null% 6e3
+    p
+}
+.norm_param_log <- function(...) {
+    p <- new("logNormParam", param = list(...))
+    p$base <- p$base %null% 2
+    p$offset <- p$offset %null% 1
+    p
+}
+.norm_param_osmfish <- function(...) {
+    new("osmFISHNormParam", param = list(...))
+}
+.norm_param_pears_resid <- function(...) {
+    p <- new("pearsonResidNormParam", param = list(...))
+    p$theta <- p$theta %null% 100
+    p
+}
+.norm_param_quantile <- function(...) {
+    new("quantileNormParam", param = list(...))
+}
+.norm_param_default <- function(...) {
+    p <- new("defaultNormParam", param = list(...))
+    p$library_size_norm <- p$library_size_norm %null% TRUE
+    p$scalefactor <- p$scalefactor %null% 6e3
+    p$log_norm <- p$log_norm %null% TRUE
+    p$log_offset <- p$log_offset %null% 1
+    p$logbase <- p$logbase %null% 2
+    p
+}
+.norm_param_tfidf <- function(...) {
+    new("tfidfNormParam", param = list(...))
+}
+.norm_param_l2 <- function(...) {
+    new("l2NormParam", param = list(...))
+}
+
+.scale_param_zscore <- function(...) {
+    p <- new("zscoreScaleParam", param = list(...))
+    p$scale <- p$scale %null% TRUE
+    p$center <- p$center %null% TRUE
+    p$MARGIN <- p$MARGIN %null% 2
+    p
+}
+.scale_param_default <- function(...) {
+    p <- new("defaultScaleParam", param = list(...))
+    p$scale_feats <- p$scale_feats %null% TRUE
+    p$scale_cells <- p$scale_cells %null% TRUE
+    p$scale_order <- p$scale_order %null% c("first_feats", "first_cells")
+    p$verbose <- p$verbose %null% TRUE
+    p
+}
+
+
+.adjust_param_limma <- function(...) {
+    p <- new("limmaAdjustParam", param = list(...))
+    # append missing keys as NULL entries without clobbering existing @param
+    if (is.null(p@param$batch_columns)) {
+        p@param <- c(p@param, list(batch_columns = NULL))
+    }
+    if (is.null(p@param$covariate_columns)) {
+        p@param <- c(p@param, list(covariate_columns = NULL))
+    }
+    p
+}
+
+
+# * implementations ####
+
+.check_svkey <- function(x, type = c("all", "any")) {
+    type <- match.arg(type, choices = c("all", "any"))
+    if (!inherits(x, "list")) x <- list(x)
+    res <- vapply(x, FUN = inherits, FUN.VALUE = logical(1L), "svkey")
+    switch(type,
+        "any" = any(res),
+        "all" = all(res)
+    )
+}
+
+# get from gobject and ensure order is correct.
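+# (hypothetical sketch: .get_svkey(svkey(feats = "total_expr"), g,
+#     sample_order = colnames(m)), where `g` and `m` are invented stand-ins
+#     for a gobject and its expression matrix)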
+# return without the cell_ID col
+.get_svkey <- function(x, gobject, sample_order = NULL) {
+    if (!inherits(x, "list")) x <- list(x)
+    reslist <- lapply(x, function(key) {
+        data <- key@get(gobject)
+        if (!is.null(sample_order)) {
+            # reorder rows so cells follow sample_order
+            data <- data[match(sample_order, cell_ID)]
+        }
+        return(data[, -"cell_ID"])
+    })
+    Reduce(cbind, reslist)
+}
+
 .l2_norm <- function(x) {
     # Calculate column norms (Euclidean length of each column)
     col_norms <- sqrt(colSums_flex(x^2))
diff --git a/man/adjust_limma.Rd b/man/adjust_limma.Rd
new file mode 100644
index 000000000..55386ae4c
--- /dev/null
+++ b/man/adjust_limma.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{adjust_limma}
+\alias{adjust_limma}
+\title{Limma Batch Correction}
+\description{
+Batch effect removal via \code{\link[limma:removeBatchEffect]{limma::removeBatchEffect()}}
+}
+\section{params}{
+
+
+\tabular{ll}{
+\code{batch_columns} \tab \link[GiottoClass:spatValues]{svkey} (optional) Up to two
+columns of metadata from a Giotto object indicating the batches whose
+effects should be removed. \cr
+\code{covariate_columns} \tab \link[GiottoClass:spatValues]{svkey} (optional) Columns
+of metadata from a Giotto object indicating covariates
+to regress out.
+}
+}
+
+\examples{
+limma <- adjustParam("limma")
+limma$covariate_columns <- svkey(feats = c("nr_feats", "total_expr"))
+
+g <- GiottoData::loadGiottoMini("visium")
+processExpression(g, limma, name = "limma")
+}
+\seealso{
+\link{process_param}
+}
+\concept{adjustment parameters}
diff --git a/man/processData.Rd b/man/processData.Rd
index aee07aaeb..d68376800 100644
--- a/man/processData.Rd
+++ b/man/processData.Rd
@@ -4,29 +4,36 @@
 \alias{processData}
 \alias{processData,exprObj,list-method}
 \alias{processData,exprObj,normParam-method}
+\alias{processData,exprObj,adjustParam-method}
 \alias{processData,exprObj,scaleParam-method}
 \alias{processData,allMatrix,list-method}
 \title{Composable Data Processing}
 \usage{
-\S4method{processData}{exprObj,list}(x, param, name = "scaled")
+\S4method{processData}{exprObj,list}(x, param, name = "scaled", ...)
 
-\S4method{processData}{exprObj,normParam}(x, param, name = "normalized")
+\S4method{processData}{exprObj,normParam}(x, param, name = "normalized", ...)
 
-\S4method{processData}{exprObj,scaleParam}(x, param, name = "scaled")
+\S4method{processData}{exprObj,adjustParam}(x, param, name = "custom", ...)
 
-\S4method{processData}{allMatrix,list}(x, param)
+\S4method{processData}{exprObj,scaleParam}(x, param, name = "scaled", ...)
+
+\S4method{processData}{allMatrix,list}(x, param, ...)
 }
 \arguments{
 \item{x}{data to transform}
 
 \item{param}{S4 parameter class defining the transform operation and
-params affecting it.}
+params affecting it. Can also be a list of several of these objects, acting
+as a pipeline.}
 
 \item{name}{character. \link[GiottoClass:giotto_schema]{Object name} to assign
 to the output.}
 
 \item{\dots}{additional params to pass}
 }
+\value{
+The same class as \code{x}
+}
 \description{
 Perform data transformations, or set up chains of transformations and
 operations to be applied to matrix type data.
\code{processData()} is a generic
@@ -52,4 +59,7 @@
 processData(m, list(lib_norm, log_norm, zscore_cols, zscore_rows))
 \seealso{
 \link{process_param} for processing operations that can be performed
 through \code{processData()}
+
+\code{\link[=processExpression]{processExpression()}} for the way to use this framework with the
+\code{giotto} object
 }
diff --git a/man/processExpression.Rd b/man/processExpression.Rd
new file mode 100644
index 000000000..3b90e8e84
--- /dev/null
+++ b/man/processExpression.Rd
@@ -0,0 +1,75 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/normalize.R
+\name{processExpression}
+\alias{processExpression}
+\title{Expression Data Processing}
+\usage{
+processExpression(
+  gobject,
+  param,
+  name,
+  expression_values = "raw",
+  spat_unit = NULL,
+  feat_type = NULL,
+  return_gobject = TRUE,
+  ...
+)
+}
+\arguments{
+\item{gobject}{\code{giotto} object}
+
+\item{param}{S4 parameter class defining the transform operation and
+params affecting it. Can also be a list of several of these objects, acting
+as a pipeline.}
+
+\item{name}{character. \link[GiottoClass:giotto_schema]{Object name} to assign
+to the output.}
+
+\item{expression_values}{character. Name of matrix to use}
+
+\item{spat_unit}{character (optional). spatial unit to use}
+
+\item{feat_type}{character (optional). feature type to use}
+
+\item{return_gobject}{logical (optional). Whether to return the \code{gobject}.
+When FALSE, the \code{exprObj} is returned instead.}
+
+\item{...}{additional params to pass}
+}
+\value{
+A \code{giotto} object when \code{return_gobject = TRUE}. Otherwise, an
+\code{exprObj}
+}
+\description{
+Perform data transformations, or set up chains of transformations and
+operations to be applied to expression type data in the \code{giotto} object.
+}
+\examples{
+g <- GiottoData::loadGiottoMini("visium")
+# single operation
+processExpression(g, normParam("library"), name = "library")
+
+# single operation with changed parameter
+lib <- normParam("library")
+lib$scalefactor <- 1000
+processExpression(g, lib, name = "library2")
+
+# return the exprObj instead
+processExpression(g, lib, name = "library2", return_gobject = FALSE)
+
+# chained operation (this is the Giotto standard normalization)
+processExpression(g,
+    list(
+        normParam("library"),
+        normParam("log"),
+        scaleParam("zscore", MARGIN = 2),
+        scaleParam("zscore", MARGIN = 1)
+    ),
+    name = "scaled2"
+)
+}
+\seealso{
+\link{process_param} for processing operations that can be performed
+
+\code{\link[=processData]{processData()}} for the lower-level generic handling these operations
+}
diff --git a/man/process_param.Rd b/man/process_param.Rd
index cdeaf46fb..a769ae578 100644
--- a/man/process_param.Rd
+++ b/man/process_param.Rd
@@ -53,7 +53,13 @@
 with a \code{MARGIN} param allowing scaling long either cols or rows
 \section{adjustParam methods}{
 \itemize{
-\item \code{"limma"} - limma batch correction
+\item \code{\link[=adjust_limma]{"limma"}} - limma batch correction
 }
 }
+\seealso{
+\code{\link[=processData]{processData()}} for the generic used to apply these params
+
+\code{\link[=processExpression]{processExpression()}} for the way to use this framework with the
+\code{giotto} object
+}
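Note on the limma hookup: below is a minimal, self-contained sketch of the
`removeBatchEffect()` call that the `limmaAdjustParam` method above
assembles. The matrix, batch labels, and covariate values are invented for
illustration; `batch2` is only filled in when a second batch column is
supplied.

    # illustration only: mirrors `limma_args` as built in the method above
    library(limma)
    set.seed(1)
    m <- matrix(rnorm(50), nrow = 5,
        dimnames = list(paste0("feat", 1:5), paste0("cell", 1:10)))
    batch <- rep(c("a", "b"), each = 5)   # stands in for `batch_columns`
    covar <- matrix(runif(10), ncol = 1)  # stands in for `covariate_columns`
    adj <- limma::removeBatchEffect(m, batch = batch, covariates = covar)
    dim(adj)  # same 5 x 10 matrix, with the batch effect regressed out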