Skip to content

Commit

Permalink
feat(layer): add tokenizers for categoricals and numerics
Browse files Browse the repository at this point in the history
  • Loading branch information
sebffischer committed Feb 6, 2025
1 parent 365ef6a commit 3fca75c
Show file tree
Hide file tree
Showing 69 changed files with 817 additions and 0 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Collate:
'PipeOpTorchOptimizer.R'
'PipeOpTorchReshape.R'
'PipeOpTorchSoftmax.R'
'PipeOpTorchTokenizer.R'
'Select.R'
'TaskClassif_cifar.R'
'TaskClassif_lazy_iris.R'
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ export(PipeOpTorchSqueeze)
export(PipeOpTorchTanh)
export(PipeOpTorchTanhShrink)
export(PipeOpTorchThreshold)
export(PipeOpTorchTokenizerCateg)
export(PipeOpTorchTokenizerNum)
export(PipeOpTorchUnsqueeze)
export(TorchCallback)
export(TorchDescriptor)
Expand Down Expand Up @@ -177,6 +179,8 @@ export(nn_merge_prod)
export(nn_merge_sum)
export(nn_reshape)
export(nn_squeeze)
export(nn_tokenizer_categ)
export(nn_tokenizer_numeric)
export(nn_unsqueeze)
export(pipeop_preproc_torch)
export(replace_head)
Expand Down
224 changes: 224 additions & 0 deletions R/PipeOpTorchTokenizer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Initializes a token embedding tensor in-place following Gorishniy et al.
# (2021): values are drawn with scale 1 / sqrt(d) so that the variance of the
# embeddings does not grow with the token dimension.
#
# @param x (`torch_tensor`) The parameter tensor, modified in-place.
# @param d (`integer(1)`) The token dimension, used to derive the init scale.
# @param initialization (`character(1)`) Either "uniform" or "normal".
# @return The tensor `x`, initialized in-place.
initialize_token_ = function(x, d, initialization) {
  assert_choice(initialization, c("uniform", "normal"))
  d_sqrt_inv = 1 / sqrt(d)
  # assert_choice() above guarantees exactly one branch matches, so the
  # previously unreachable stopf() fallback has been dropped.
  switch(initialization,
    uniform = nn_init_uniform_(x, a = -d_sqrt_inv, b = d_sqrt_inv),
    normal = nn_init_normal_(x, std = d_sqrt_inv)
  )
}

#' @title Numeric Tokenizer
#' @inherit nn_tokenizer_num description
#' @section nn_module:
#' Calls [`nn_tokenizer_numeric()`] when trained where the parameter `n_features` is inferred.
#' The output shape is `(batch, n_features, d_token)`.
#'
#' @section Parameters:
#' * `d_token` :: `integer(1)`\cr
#'   The dimension of the embedding.
#' * `bias` :: `logical(1)`\cr
#'   Whether to use a bias. Is initialized to `TRUE`.
#' * `initialization` :: `character(1)`\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"` (default)
#'   and `"normal"`.
#'
#' @templateVar id nn_tokenizer_num
#' @template pipeop_torch_channels_default
#' @templateVar param_vals d_token = 10
#' @template pipeop_torch
#' @template pipeop_torch_example
#'
#' @export
PipeOpTorchTokenizerNum = R6Class("PipeOpTorchTokenizerNum",
  inherit = PipeOpTorch,
  public = list(
    #' @description Creates a new instance of this [R6][R6::R6Class] class.
    #' @template params_pipelines
    initialize = function(id = "nn_tokenizer_num", param_vals = list()) {
      param_set = ps(
        d_token = p_int(lower = 1, tags = c("train", "required")),
        bias = p_lgl(init = TRUE, tags = c("train", "required")),
        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = c("train", "required"))
      )
      super$initialize(
        id = id,
        param_set = param_set,
        param_vals = param_vals,
        module_generator = nn_tokenizer_numeric
      )
    }
  ),
  private = list(
    # n_features is not a user-facing parameter: it is inferred at build time
    # from the second dimension of the (batch, n_features) input shape.
    .shape_dependent_params = function(shapes_in, param_vals, task) {
      c(param_vals, list(n_features = shapes_in[[1]][2]))
    },
    .shapes_out = function(shapes_in, param_vals, task) {
      if (length(shapes_in[[1]]) != 2) {
        # FIX: report the rank of the input shape (length(shapes_in[[1]])),
        # not the number of input channels (length(shapes_in)).
        stopf("Numeric tokenizer expects 2 input dimensions, but got %i", length(shapes_in[[1]]))
      }
      # Appends the token dimension: (batch, n_features) -> (batch, n_features, d_token).
      list(c(shapes_in[[1]], param_vals$d_token))
    }
  )
)

#' @title Numeric Tokenizer
#' @name nn_tokenizer_num
#' @description
#' Tokenizes numeric features into a dense embedding.
#' Each of the `n_features` input columns gets its own learned `d_token`-dimensional
#' embedding vector, which is scaled by the feature's value (plus an optional
#' per-feature bias), mapping `(batch, n_features)` to `(batch, n_features, d_token)`.
#' @param n_features (`integer(1)`)\cr
#'   The number of features.
#' @param d_token (`integer(1)`)\cr
#'   The dimension of the embedding.
#' @param bias (`logical(1)`)\cr
#'   Whether to use a bias.
#' @param initialization (`character(1)`)\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"`
#'   and `"normal"`.
#'
#' @references
#' `r format_bib("gorishniy2021revisiting")`
#' @export
nn_tokenizer_numeric = nn_module(
  "nn_tokenizer_num",
  initialize = function(n_features, d_token, bias, initialization) {
    self$n_features = assert_int(n_features, lower = 1L)
    self$d_token = assert_int(d_token, lower = 1L)
    self$initialization = assert_choice(initialization, c("uniform", "normal"))
    assert_flag(bias)

    # One d_token-dimensional embedding vector per feature.
    self$weight = nn_parameter(torch_empty(self$n_features, d_token))
    if (bias) {
      # Per-feature additive bias, same shape as the weight.
      self$bias = nn_parameter(torch_empty(self$n_features, d_token))
    } else {
      self$bias = NULL
    }

    self$reset_parameters()
    # Derived from the weight's shape; d_token is re-assigned here to the same
    # value that was validated above.
    self$n_tokens = self$weight$shape[1]
    self$d_token = self$weight$shape[2]
  },
  # Re-initializes weight (and bias, if present) in-place with scale 1/sqrt(d_token).
  reset_parameters = function() {
    initialize_token_(self$weight, self$d_token, self$initialization)
    if (!is.null(self$bias)) {
      initialize_token_(self$bias, self$d_token, self$initialization)
    }
  },
  # input is expected to be (batch, n_features). weight[NULL] prepends a batch
  # dimension -> (1, n_features, d_token); input[.., NULL] appends a token
  # dimension -> (batch, n_features, 1); broadcasting the product yields
  # (batch, n_features, d_token).
  forward = function(input) {
    x = self$weight[NULL] * input[.., NULL]
    if (!is.null(self$bias)) {
      x = x + self$bias[NULL]
    }
    return(x)
  }
)

#' @title Categorical Tokenizer
#' @name nn_tokenizer_categ
#' @description
#' Tokenizes categorical features into a dense embedding.
#' All features share a single embedding table with `sum(cardinalities)` rows;
#' per-feature offsets map each feature's level indices into its own disjoint
#' slice of that table.
#' @param cardinalities (`integer()`)\cr
#'   The number of categories for each feature.
#' @param d_token (`integer(1)`)\cr
#'   The dimension of the embedding.
#' @param bias (`logical(1)`)\cr
#'   Whether to use a bias.
#' @param initialization (`character(1)`)\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"`
#'   and `"normal"`.
#'
#' @references
#' `r format_bib("gorishniy2021revisiting")`
#'
#' @export
nn_tokenizer_categ = nn_module(
  "nn_tokenizer_categ",
  initialize = function(cardinalities, d_token, bias, initialization) {
    self$cardinalities = assert_integerish(cardinalities, lower = 1L, any.missing = FALSE,
      min.len = 1L, coerce = TRUE)
    self$d_token = assert_int(d_token, lower = 1L)

    self$initialization = assert_choice(initialization, c("uniform", "normal"))
    assert_flag(bias)
    # Offset for feature j is the total number of categories of all preceding
    # features, i.e. c(0, cumsum(cardinalities)[-last]). Assumes 1-based level
    # codes (R factor encoding), so feature j's levels 1..card_j map to rows
    # offset_j + 1 .. offset_j + card_j of the shared embedding table.
    cardinalities_cs = cumsum(cardinalities)
    category_offsets = torch_tensor(c(0, cardinalities_cs[-length(cardinalities_cs)]),
      dtype = torch_long())
    # Buffer (not a parameter): moves with the module's device but is not
    # trained; persistent = FALSE keeps it out of the state dict.
    self$register_buffer("category_offsets", category_offsets, persistent = FALSE)
    n_embeddings = cardinalities_cs[length(cardinalities_cs)]

    # One shared table with sum(cardinalities) rows.
    self$embeddings = nn_embedding(n_embeddings, d_token)
    if (bias) {
      # Per-feature additive bias: one d_token vector per feature.
      self$bias = nn_parameter(torch_empty(length(cardinalities), d_token))
    } else {
      self$bias = NULL
    }

    self$reset_parameters()
    self$n_tokens = self$category_offsets$shape[1]
    self$d_token = self$embeddings$embedding_dim
  },
  # Re-initializes the embedding table (and bias, if present) in-place with
  # scale 1/sqrt(d_token).
  reset_parameters = function() {
    initialize_token_(self$embeddings$weight, d = self$d_token, self$initialization)
    if (!is.null(self$bias)) {
      initialize_token_(self$bias, d = self$d_token, self$initialization)
    }
  },
  # input is expected to be (batch, n_features) integer level codes.
  # category_offsets[NULL] broadcasts the per-feature offsets over the batch,
  # producing global row indices into the shared table; the lookup yields
  # (batch, n_features, d_token).
  forward = function(input) {
    x = self$embeddings(input + self$category_offsets[NULL])
    if (!is.null(self$bias)) {
      x = x + self$bias[NULL]
    }
    return(x)
  }
)

#' @title Categorical Tokenizer
#' @inherit nn_tokenizer_categ description
#' @section nn_module:
#' Calls [`nn_tokenizer_categ()`] when trained where the parameter `cardinalities` is inferred.
#' The output shape is `(batch, n_features, d_token)`.
#' @inheritSection mlr_pipeops_nn_tokenizer_num Parameters
#' @templateVar id nn_tokenizer_categ
#' @template pipeop_torch_channels_default
#' @templateVar param_vals d_token = 10
#' @template pipeop_torch
#' @template pipeop_torch_example
#'
#' @export
PipeOpTorchTokenizerCateg = R6Class("PipeOpTorchTokenizerCateg",
  inherit = PipeOpTorch,
  public = list(
    #' @description Creates a new instance of this [R6][R6::R6Class] class.
    #' @template params_pipelines
    initialize = function(id = "nn_tokenizer_categ", param_vals = list()) {
      # Tags are kept consistent with PipeOpTorchTokenizerNum: all three
      # parameters are train-time and required ("required" is satisfied by the
      # init values of bias and initialization).
      param_set = ps(
        d_token = p_int(lower = 1, tags = c("train", "required")),
        bias = p_lgl(init = TRUE, tags = c("train", "required")),
        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = c("train", "required"))
      )
      super$initialize(
        id = id,
        param_set = param_set,
        param_vals = param_vals,
        module_generator = nn_tokenizer_categ
      )
    }
  ),
  private = list(
    # cardinalities is inferred from the task: one entry per feature, the
    # number of levels of that feature.
    # NOTE(review): this iterates over all of task$feature_names — presumably
    # the task is already restricted to categorical features here; verify.
    .shape_dependent_params = function(shapes_in, param_vals, task) {
      c(param_vals, list(cardinalities = lengths(task$levels(task$feature_names))))
    },
    .shapes_out = function(shapes_in, param_vals, task) {
      if (length(shapes_in[[1]]) != 2) {
        # FIX: the message previously said "Numeric tokenizer" (copy-paste from
        # PipeOpTorchTokenizerNum) and reported the number of input channels
        # instead of the input rank.
        stopf("Categorical tokenizer expects 2 input dimensions, but got %i", length(shapes_in[[1]]))
      }
      # Appends the token dimension: (batch, n_features) -> (batch, n_features, d_token).
      list(c(shapes_in[[1]], param_vals$d_token))
    }
  )
)

#' @include aaa.R
# Register both PipeOps in the package's pipeop dictionary under their
# default ids, so they can be constructed via po("nn_tokenizer_num") /
# po("nn_tokenizer_categ").
register_po("nn_tokenizer_num", PipeOpTorchTokenizerNum)
register_po("nn_tokenizer_categ", PipeOpTorchTokenizerCateg)
2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_block.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_celu.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv_transpose1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv_transpose2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3fca75c

Please sign in to comment.