diff --git a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc index 40d9a8b28a0b8..acce68328eba9 100644 --- a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc +++ b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc @@ -195,6 +195,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -249,6 +253,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -296,6 +304,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -366,6 +378,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -413,6 +429,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -475,6 +495,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] diff --git a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc index 747113be98c2e..7db379b65c2e1 100644 --- a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc +++ b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc @@ -454,6 +454,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -496,6 +500,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -532,6 +540,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -591,6 +603,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -626,6 +642,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -677,6 +697,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index 37133c27e5319..8738b754f41f8 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -925,6 +925,22 @@ Specifies if the tokenization lower case the text sequence when building the tokens. end::inference-config-nlp-tokenization-bert-do-lower-case[] +tag::inference-config-nlp-tokenization-bert-truncate[] +Indicates how tokens are truncated when they exceed `max_sequence_length`. +The default value is `first`. ++ +-- +* `none`: No truncation occurs; the inference request receives an error. +* `first`: Only the first sequence is truncated. +* `second`: Only the second sequence is truncated. If there is just one sequence, + that sequence is truncated. +-- + +NOTE: For `zero_shot_classification`, the hypothesis sequence is always the second +sequence. Therefore, do not use `second` in this case. + +end::inference-config-nlp-tokenization-bert-truncate[] + tag::inference-config-nlp-tokenization-bert-with-special-tokens[] Tokenize with special tokens. The tokens typically included in BERT-style tokenization are: +