Skip to content

Commit

Permalink
feat(layer): add tokenizers for categoricals and numerics
Browse files Browse the repository at this point in the history
  • Loading branch information
sebffischer committed Feb 6, 2025
1 parent 365ef6a commit 3fca75c
Show file tree
Hide file tree
Showing 69 changed files with 817 additions and 0 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Collate:
'PipeOpTorchOptimizer.R'
'PipeOpTorchReshape.R'
'PipeOpTorchSoftmax.R'
'PipeOpTorchTokenizer.R'
'Select.R'
'TaskClassif_cifar.R'
'TaskClassif_lazy_iris.R'
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ export(PipeOpTorchSqueeze)
export(PipeOpTorchTanh)
export(PipeOpTorchTanhShrink)
export(PipeOpTorchThreshold)
export(PipeOpTorchTokenizerCateg)
export(PipeOpTorchTokenizerNum)
export(PipeOpTorchUnsqueeze)
export(TorchCallback)
export(TorchDescriptor)
Expand Down Expand Up @@ -177,6 +179,8 @@ export(nn_merge_prod)
export(nn_merge_sum)
export(nn_reshape)
export(nn_squeeze)
export(nn_tokenizer_categ)
export(nn_tokenizer_numeric)
export(nn_unsqueeze)
export(pipeop_preproc_torch)
export(replace_head)
Expand Down
224 changes: 224 additions & 0 deletions R/PipeOpTorchTokenizer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Initializes a token embedding tensor in-place following Gorishniy et al.
# (2021): values are drawn with scale 1 / sqrt(d) so that the variance of the
# embeddings does not grow with the token dimension.
#
# @param x (`torch_tensor`) The parameter tensor, modified in-place.
# @param d (`integer(1)`) The token dimension, used to derive the init scale.
# @param initialization (`character(1)`) Either "uniform" or "normal".
# @return The tensor `x`, initialized in-place.
initialize_token_ = function(x, d, initialization) {
  assert_choice(initialization, c("uniform", "normal"))
  d_sqrt_inv = 1 / sqrt(d)
  # assert_choice() above guarantees exactly one branch matches, so the
  # previously unreachable stopf() fallback has been dropped.
  switch(initialization,
    uniform = nn_init_uniform_(x, a = -d_sqrt_inv, b = d_sqrt_inv),
    normal = nn_init_normal_(x, std = d_sqrt_inv)
  )
}

#' @title Numeric Tokenizer
#' @inherit nn_tokenizer_num description
#' @section nn_module:
#' Calls [`nn_tokenizer_numeric()`] when trained where the parameter `n_features` is inferred.
#' The output shape is `(batch, n_features, d_token)`.
#'
#' @section Parameters:
#' * `d_token` :: `integer(1)`\cr
#'   The dimension of the embedding.
#' * `bias` :: `logical(1)`\cr
#'   Whether to use a bias. Is initialized to `TRUE`.
#' * `initialization` :: `character(1)`\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"` (default)
#'   and `"normal"`.
#'
#' @templateVar id nn_tokenizer_num
#' @template pipeop_torch_channels_default
#' @templateVar param_vals d_token = 10
#' @template pipeop_torch
#' @template pipeop_torch_example
#'
#' @export
PipeOpTorchTokenizerNum = R6Class("PipeOpTorchTokenizerNum",
  inherit = PipeOpTorch,
  public = list(
    #' @description Creates a new instance of this [R6][R6::R6Class] class.
    #' @template params_pipelines
    initialize = function(id = "nn_tokenizer_num", param_vals = list()) {
      param_set = ps(
        d_token = p_int(lower = 1, tags = c("train", "required")),
        bias = p_lgl(init = TRUE, tags = c("train", "required")),
        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = c("train", "required"))
      )
      super$initialize(
        id = id,
        param_set = param_set,
        param_vals = param_vals,
        module_generator = nn_tokenizer_numeric
      )
    }
  ),
  private = list(
    # n_features is not a user-facing parameter: it is inferred at build time
    # from the second dimension of the (batch, n_features) input shape.
    .shape_dependent_params = function(shapes_in, param_vals, task) {
      c(param_vals, list(n_features = shapes_in[[1]][2]))
    },
    .shapes_out = function(shapes_in, param_vals, task) {
      if (length(shapes_in[[1]]) != 2) {
        # FIX: report the rank of the input shape (length(shapes_in[[1]])),
        # not the number of input channels (length(shapes_in)).
        stopf("Numeric tokenizer expects 2 input dimensions, but got %i", length(shapes_in[[1]]))
      }
      # Appends the token dimension: (batch, n_features) -> (batch, n_features, d_token).
      list(c(shapes_in[[1]], param_vals$d_token))
    }
  )
)

#' @title Numeric Tokenizer
#' @name nn_tokenizer_num
#' @description
#' Tokenizes numeric features into a dense embedding.
#' Each of the `n_features` input columns gets its own learned `d_token`-dimensional
#' embedding vector, which is scaled by the feature's value (plus an optional
#' per-feature bias), mapping `(batch, n_features)` to `(batch, n_features, d_token)`.
#' @param n_features (`integer(1)`)\cr
#'   The number of features.
#' @param d_token (`integer(1)`)\cr
#'   The dimension of the embedding.
#' @param bias (`logical(1)`)\cr
#'   Whether to use a bias.
#' @param initialization (`character(1)`)\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"`
#'   and `"normal"`.
#'
#' @references
#' `r format_bib("gorishniy2021revisiting")`
#' @export
nn_tokenizer_numeric = nn_module(
  "nn_tokenizer_num",
  initialize = function(n_features, d_token, bias, initialization) {
    self$n_features = assert_int(n_features, lower = 1L)
    self$d_token = assert_int(d_token, lower = 1L)
    self$initialization = assert_choice(initialization, c("uniform", "normal"))
    assert_flag(bias)

    # One d_token-dimensional embedding vector per feature.
    self$weight = nn_parameter(torch_empty(self$n_features, d_token))
    if (bias) {
      # Per-feature additive bias, same shape as the weight.
      self$bias = nn_parameter(torch_empty(self$n_features, d_token))
    } else {
      self$bias = NULL
    }

    self$reset_parameters()
    # Derived from the weight's shape; d_token is re-assigned here to the same
    # value that was validated above.
    self$n_tokens = self$weight$shape[1]
    self$d_token = self$weight$shape[2]
  },
  # Re-initializes weight (and bias, if present) in-place with scale 1/sqrt(d_token).
  reset_parameters = function() {
    initialize_token_(self$weight, self$d_token, self$initialization)
    if (!is.null(self$bias)) {
      initialize_token_(self$bias, self$d_token, self$initialization)
    }
  },
  # input is expected to be (batch, n_features). weight[NULL] prepends a batch
  # dimension -> (1, n_features, d_token); input[.., NULL] appends a token
  # dimension -> (batch, n_features, 1); broadcasting the product yields
  # (batch, n_features, d_token).
  forward = function(input) {
    x = self$weight[NULL] * input[.., NULL]
    if (!is.null(self$bias)) {
      x = x + self$bias[NULL]
    }
    return(x)
  }
)

#' @title Categorical Tokenizer
#' @name nn_tokenizer_categ
#' @description
#' Tokenizes categorical features into a dense embedding.
#' All features share a single embedding table with `sum(cardinalities)` rows;
#' per-feature offsets map each feature's level indices into its own disjoint
#' slice of that table.
#' @param cardinalities (`integer()`)\cr
#'   The number of categories for each feature.
#' @param d_token (`integer(1)`)\cr
#'   The dimension of the embedding.
#' @param bias (`logical(1)`)\cr
#'   Whether to use a bias.
#' @param initialization (`character(1)`)\cr
#'   The initialization method for the embedding weights. Possible values are `"uniform"`
#'   and `"normal"`.
#'
#' @references
#' `r format_bib("gorishniy2021revisiting")`
#'
#' @export
nn_tokenizer_categ = nn_module(
  "nn_tokenizer_categ",
  initialize = function(cardinalities, d_token, bias, initialization) {
    self$cardinalities = assert_integerish(cardinalities, lower = 1L, any.missing = FALSE,
      min.len = 1L, coerce = TRUE)
    self$d_token = assert_int(d_token, lower = 1L)

    self$initialization = assert_choice(initialization, c("uniform", "normal"))
    assert_flag(bias)
    # Offset for feature j is the total number of categories of all preceding
    # features, i.e. c(0, cumsum(cardinalities)[-last]). Assumes 1-based level
    # codes (R factor encoding), so feature j's levels 1..card_j map to rows
    # offset_j + 1 .. offset_j + card_j of the shared embedding table.
    cardinalities_cs = cumsum(cardinalities)
    category_offsets = torch_tensor(c(0, cardinalities_cs[-length(cardinalities_cs)]),
      dtype = torch_long())
    # Buffer (not a parameter): moves with the module's device but is not
    # trained; persistent = FALSE keeps it out of the state dict.
    self$register_buffer("category_offsets", category_offsets, persistent = FALSE)
    n_embeddings = cardinalities_cs[length(cardinalities_cs)]

    # One shared table with sum(cardinalities) rows.
    self$embeddings = nn_embedding(n_embeddings, d_token)
    if (bias) {
      # Per-feature additive bias: one d_token vector per feature.
      self$bias = nn_parameter(torch_empty(length(cardinalities), d_token))
    } else {
      self$bias = NULL
    }

    self$reset_parameters()
    self$n_tokens = self$category_offsets$shape[1]
    self$d_token = self$embeddings$embedding_dim
  },
  # Re-initializes the embedding table (and bias, if present) in-place with
  # scale 1/sqrt(d_token).
  reset_parameters = function() {
    initialize_token_(self$embeddings$weight, d = self$d_token, self$initialization)
    if (!is.null(self$bias)) {
      initialize_token_(self$bias, d = self$d_token, self$initialization)
    }
  },
  # input is expected to be (batch, n_features) integer level codes.
  # category_offsets[NULL] broadcasts the per-feature offsets over the batch,
  # producing global row indices into the shared table; the lookup yields
  # (batch, n_features, d_token).
  forward = function(input) {
    x = self$embeddings(input + self$category_offsets[NULL])
    if (!is.null(self$bias)) {
      x = x + self$bias[NULL]
    }
    return(x)
  }
)

#' @title Categorical Tokenizer
#' @inherit nn_tokenizer_categ description
#' @section nn_module:
#' Calls [`nn_tokenizer_categ()`] when trained where the parameter `cardinalities` is inferred.
#' The output shape is `(batch, n_features, d_token)`.
#' @inheritSection mlr_pipeops_nn_tokenizer_num Parameters
#' @templateVar id nn_tokenizer_categ
#' @template pipeop_torch_channels_default
#' @templateVar param_vals d_token = 10
#' @template pipeop_torch
#' @template pipeop_torch_example
#'
#' @export
PipeOpTorchTokenizerCateg = R6Class("PipeOpTorchTokenizerCateg",
  inherit = PipeOpTorch,
  public = list(
    #' @description Creates a new instance of this [R6][R6::R6Class] class.
    #' @template params_pipelines
    initialize = function(id = "nn_tokenizer_categ", param_vals = list()) {
      # Tags are kept consistent with PipeOpTorchTokenizerNum: all three
      # parameters are train-time and required ("required" is satisfied by the
      # init values of bias and initialization).
      param_set = ps(
        d_token = p_int(lower = 1, tags = c("train", "required")),
        bias = p_lgl(init = TRUE, tags = c("train", "required")),
        initialization = p_fct(init = "uniform", levels = c("uniform", "normal"), tags = c("train", "required"))
      )
      super$initialize(
        id = id,
        param_set = param_set,
        param_vals = param_vals,
        module_generator = nn_tokenizer_categ
      )
    }
  ),
  private = list(
    # cardinalities is inferred from the task: one entry per feature, the
    # number of levels of that feature.
    # NOTE(review): this iterates over all of task$feature_names — presumably
    # the task is already restricted to categorical features here; verify.
    .shape_dependent_params = function(shapes_in, param_vals, task) {
      c(param_vals, list(cardinalities = lengths(task$levels(task$feature_names))))
    },
    .shapes_out = function(shapes_in, param_vals, task) {
      if (length(shapes_in[[1]]) != 2) {
        # FIX: the message previously said "Numeric tokenizer" (copy-paste from
        # PipeOpTorchTokenizerNum) and reported the number of input channels
        # instead of the input rank.
        stopf("Categorical tokenizer expects 2 input dimensions, but got %i", length(shapes_in[[1]]))
      }
      # Appends the token dimension: (batch, n_features) -> (batch, n_features, d_token).
      list(c(shapes_in[[1]], param_vals$d_token))
    }
  )
)

#' @include aaa.R
# Register both PipeOps in the package's pipeop dictionary under their
# default ids, so they can be constructed via po("nn_tokenizer_num") /
# po("nn_tokenizer_categ").
register_po("nn_tokenizer_num", PipeOpTorchTokenizerNum)
register_po("nn_tokenizer_categ", PipeOpTorchTokenizerCateg)
2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_adaptive_avg_pool3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_avg_pool3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_batch_norm3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_block.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_celu.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv3d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv_transpose1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_nn_conv_transpose2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3fca75c

Please sign in to comment.