diff --git a/bindings/node/CHANGELOG.md b/bindings/node/CHANGELOG.md
index b45d058b0..5d785e2f1 100644
--- a/bindings/node/CHANGELOG.md
+++ b/bindings/node/CHANGELOG.md
@@ -1,3 +1,8 @@
+## [0.13.0]
+
+- [#1008] `Decoder` is now a composable trait, without breaking backward compatibility
+- [#1047, #1051, #1052] `Processor` is now a composable trait, without breaking backward compatibility
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -160,6 +165,13 @@ The files must now be provided first when calling `tokenizer.train(files, traine
 
 - Actually add special tokens in tokenizers implementations ([acef252](https://github.com/huggingface/tokenizers/commit/acef252dacc43adc414175cfc325668ad1488753))
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
diff --git a/bindings/node/package.json b/bindings/node/package.json
index 88c2f50c2..f1aa4bf24 100644
--- a/bindings/node/package.json
+++ b/bindings/node/package.json
@@ -1,6 +1,6 @@
 {
   "name": "tokenizers",
-  "version": "0.12.1",
+  "version": "0.13.0",
   "description": "",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index 6b0b34af0..0c8984413 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.13.0]
+
+- [#956] PyO3 version upgrade
+- [#1055] M1 automated builds
+- [#1008] `Decoder` is now a composable trait, without breaking backward compatibility
+- [#1047, #1051, #1052] `Processor` is now a composable trait, without breaking backward compatibility
+
+Both trait changes warrant a "major" version bump: despite best efforts not to break backward
+ compatibility, the code has changed enough that we cannot be entirely sure.
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -376,6 +386,13 @@ delimiter (Works like `.split(delimiter)`)
 
 - Fix a bug that was causing crashes in Python 3.5
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index 1867a9ed3..5a96c230e 100644
--- a/bindings/python/Cargo.lock
+++ b/bindings/python/Cargo.lock
@@ -1700,7 +1700,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
 [[package]]
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.0"
 dependencies = [
  "aho-corasick",
  "cached-path",
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index c4ba61ce4..3ae412e61 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.12.1.dev0"
+__version__ = "0.13.0.dev0"
 
 from typing import Tuple, Union, Tuple, List
 from enum import Enum
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index f3744ee64..649653c6b 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="tokenizers",
-    version="0.12.1.dev0",
+    version="0.13.0.dev0",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/tokenizers/CHANGELOG.md b/tokenizers/CHANGELOG.md
index ad986235a..79001bddd 100644
--- a/tokenizers/CHANGELOG.md
+++ b/tokenizers/CHANGELOG.md
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.13.0]
+
+- [#1009] `unstable_wasm` feature to support building on Wasm (it's unstable!)
+- [#1008] `Decoder` is now a composable trait, without breaking backward compatibility
+- [#1047, #1051, #1052] `Processor` is now a composable trait, without breaking backward compatibility
+
+Both trait changes warrant a "major" version bump: despite best efforts not to break backward
+ compatibility, the code has changed enough that we cannot be entirely sure.
+
 ## [0.12.1]
 
 - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520
@@ -157,6 +166,13 @@ split up in multiple bytes
 
 - [#174]: The `LongestFirst` truncation strategy had a bug
 
+[#956]: https://github.com/huggingface/tokenizers/pull/956
+[#1008]: https://github.com/huggingface/tokenizers/pull/1008
+[#1009]: https://github.com/huggingface/tokenizers/pull/1009
+[#1047]: https://github.com/huggingface/tokenizers/pull/1047
+[#1055]: https://github.com/huggingface/tokenizers/pull/1055
+[#1051]: https://github.com/huggingface/tokenizers/pull/1051
+[#1052]: https://github.com/huggingface/tokenizers/pull/1052
 [#938]: https://github.com/huggingface/tokenizers/pull/938
 [#939]: https://github.com/huggingface/tokenizers/pull/939
 [#952]: https://github.com/huggingface/tokenizers/pull/952
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 222aa0275..3f4877c88 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI "]
 edition = "2018"
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 243f1a137..b8468afc1 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -484,7 +484,7 @@ mod tests {
         );
         let expected = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -508,7 +508,7 @@ mod tests {
 
         let pair_expected = Encoding::new(
             vec![0; 10],
-            vec![],
+            vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs
index 39a834cd2..627f9d180 100644
--- a/tokenizers/src/processors/bert.rs
+++ b/tokenizers/src/processors/bert.rs
@@ -195,4 +195,91 @@ mod tests {
             bert
         );
     }
+
+    #[test]
+    fn bert_processing() {
+        let processor = BertProcessing::default();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 3);
+
+        use crate::Token;
+        let encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![101, 12, 14, 102],
+                vec![0, 0, 0, 0],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor
+            .process(encoding.clone(), Some(pair.clone()), true)
+            .unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![101, 12, 14, 102, 15, 102],
+                vec![0, 0, 0, 0, 1, 1],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into(),
+                    "pair".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)],
+                vec![1, 0, 0, 1, 0, 1],
+                vec![1, 1, 1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(5), None);
+
+        // No special tokens
+        let pair_encoding = processor.process(encoding, Some(pair), false).unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![12, 14, 15],
+                vec![0, 0, 1],
+                vec!["Hello".into(), "there".into(), "pair".into(),],
+                vec![None, None, None],
+                vec![(0, 5), (6, 11), (0, 4)],
+                vec![0, 0, 0],
+                vec![1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(0), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(1), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(1));
+    }
 }
diff --git a/tokenizers/src/processors/roberta.rs b/tokenizers/src/processors/roberta.rs
index ab83e4629..749164182 100644
--- a/tokenizers/src/processors/roberta.rs
+++ b/tokenizers/src/processors/roberta.rs
@@ -146,7 +146,7 @@ impl PostProcessor for RobertaProcessing {
             )
         } else {
             let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-            let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
+            let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
             let pair_tokens = [
                 &[self.sep.0.clone()],
                 encoding.get_tokens(),
@@ -176,7 +176,7 @@
                 .map(|encoding| {
                     let pair_ids =
                         [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
-                    let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
+                    let pair_type_ids = vec![1; encoding.get_ids().len() + 2];
                    let pair_tokens = [
                         &[self.sep.0.clone()],
                         encoding.get_tokens(),
@@ -240,4 +240,88 @@ mod tests {
             roberta
         );
     }
+
+    #[test]
+    fn roberta_processing() {
+        let processor = RobertaProcessing::default();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 4);
+
+        use crate::Token;
+        let encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![0, 12, 14, 2],
+                vec![0, 0, 0, 0],
+                vec!["<s>".into(), "Hello".into(), "there".into(), "</s>".into()],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor
+            .process(encoding.clone(), Some(pair.clone()), true)
+            .unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![0, 12, 14, 2, 2, 15, 2],
+                vec![0, 0, 0, 0, 1, 1, 1],
+                vec![
+                    "<s>".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "</s>".into(),
+                    "</s>".into(),
+                    "pair".into(),
+                    "</s>".into()
+                ],
+                vec![None, None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 0), (0, 4), (0, 0)],
+                vec![1, 0, 0, 1, 1, 0, 1],
+                vec![1, 1, 1, 1, 1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 1..3), (1, 5..6)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), None);
+        assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(6), None);
+
+        // No special tokens
+        let pair_encoding = processor.process(encoding, Some(pair), false).unwrap();
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![12, 14, 15],
+                vec![0, 0, 1],
+                vec!["Hello".into(), "there".into(), "pair".into(),],
+                vec![None, None, None],
+                vec![(0, 5), (6, 11), (0, 4)],
+                vec![0, 0, 0],
+                vec![1, 1, 1],
+                vec![],
+                HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(0), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(1), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(1));
+    }
 }
diff --git a/tokenizers/src/processors/sequence.rs b/tokenizers/src/processors/sequence.rs
index 21d58d416..66c670ad8 100644
--- a/tokenizers/src/processors/sequence.rs
+++ b/tokenizers/src/processors/sequence.rs
@@ -47,7 +47,7 @@ mod tests {
     fn process_chain() {
         let start = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -67,7 +67,7 @@
         let sequence = Sequence::new(vec![PostProcessorWrapper::ByteLevel(bytelevel)]);
         let expected = Encoding::new(
             vec![0; 5],
-            vec![],
+            vec![0; 5],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
@@ -94,7 +94,7 @@
 
         let pair_expected = Encoding::new(
             vec![0; 10],
-            vec![],
+            vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
             vec![
                 "Ġ".into(),
                 "ĠĠĠĠHelloĠĠ".into(),
diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs
index 262323cfa..13bc91e56 100644
--- a/tokenizers/src/processors/template.rs
+++ b/tokenizers/src/processors/template.rs
@@ -885,6 +885,182 @@ mod tests {
         assert_eq!(pair_encoding.token_to_sequence(5), None);
     }
 
+    #[test]
+    fn template_processing_overflowing() {
+        let processor = tests::get_bert_template();
+        assert_eq!(processor.added_tokens(false), 2);
+        assert_eq!(processor.added_tokens(true), 3);
+
+        use crate::Token;
+        let mut encoding = Encoding::from_tokens(
+            vec![
+                Token::new(12, "Hello".into(), (0, 5)),
+                Token::new(14, "there".into(), (6, 11)),
+            ],
+            0,
+        );
+        let overflowing = Encoding::from_tokens(vec![Token::new(13, "you".into(), (12, 15))], 0);
+        encoding.set_overflowing(vec![overflowing]);
+
+        let mut pair = Encoding::from_tokens(
+            vec![
+                Token::new(15, "pair".into(), (0, 4)),
+                Token::new(16, "with".into(), (5, 9)),
+            ],
+            0,
+        );
+        let pair_overflowing =
+            Encoding::from_tokens(vec![Token::new(17, "info".into(), (10, 14))], 0);
+        pair.set_overflowing(vec![pair_overflowing]);
+
+        let single_encoding = processor.process(encoding.clone(), None, true).unwrap();
+        assert_eq!(
+            single_encoding,
+            Encoding::new(
+                vec![1, 12, 14, 0],
+                vec![0, 0, 0, 0],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0)],
+                vec![1, 0, 0, 1],
+                vec![1, 1, 1, 1],
+                vec![Encoding::new(
+                    vec![1, 13, 0],
+                    vec![0, 0, 0],
+                    vec!["[CLS]".into(), "you".into(), "[SEP]".into()],
+                    vec![None, None, None],
+                    vec![(0, 0), (12, 15), (0, 0)],
+                    vec![1, 0, 1],
+                    vec![1, 1, 1],
+                    vec![],
+                    HashMap::from_iter(vec![(0, 1..2)]),
+                )],
+                HashMap::from_iter(vec![(0, 1..3)]),
+            )
+        );
+        assert_eq!(single_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(single_encoding.token_to_sequence(3), None);
+        let pair_encoding = processor.process(encoding, Some(pair), true).unwrap();
+        println!("{pair_encoding:#?}");
+        assert_eq!(
+            pair_encoding,
+            Encoding::new(
+                vec![1, 12, 14, 0, 15, 16, 0],
+                vec![0, 0, 0, 0, 1, 1, 1],
+                vec![
+                    "[CLS]".into(),
+                    "Hello".into(),
+                    "there".into(),
+                    "[SEP]".into(),
+                    "pair".into(),
+                    "with".into(),
+                    "[SEP]".into()
+                ],
+                vec![None, None, None, None, None, None, None],
+                vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (5, 9), (0, 0)],
+                vec![1, 0, 0, 1, 0, 0, 1],
+                vec![1, 1, 1, 1, 1, 1, 1],
+                vec![
+                    Encoding::new(
+                        vec![1, 13, 0, 15, 16, 0],
+                        vec![0, 0, 0, 1, 1, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "you".into(),
+                            "[SEP]".into(),
+                            "pair".into(),
+                            "with".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None, None],
+                        vec![(0, 0), (12, 15), (0, 0), (0, 4), (5, 9), (0, 0)],
+                        vec![1, 0, 1, 0, 0, 1],
+                        vec![1, 1, 1, 1, 1, 1],
+                        vec![Encoding::new(
+                            vec![1, 13, 0, 17, 0],
+                            vec![0, 0, 0, 0, 1],
+                            vec![
+                                "[CLS]".into(),
+                                "you".into(),
+                                "[SEP]".into(),
+                                "info".into(),
+                                "[SEP]".into()
+                            ],
+                            vec![None, None, None, None, None,],
+                            vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                            vec![1, 0, 1, 0, 1],
+                            vec![1, 1, 1, 1, 1],
+                            vec![],
+                            HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                        ),],
+                        HashMap::from_iter(vec![(1, 3..5), (0, 1..2)]),
+                    ),
+                    Encoding::new(
+                        vec![1, 13, 0, 17, 0],
+                        vec![0, 0, 0, 0, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "you".into(),
+                            "[SEP]".into(),
+                            "info".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None,],
+                        vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                        vec![1, 0, 1, 0, 1],
+                        vec![1, 1, 1, 1, 1],
+                        vec![],
+                        HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                    ),
+                    Encoding::new(
+                        vec![1, 12, 14, 0, 17, 0],
+                        vec![0, 0, 0, 0, 0, 1],
+                        vec![
+                            "[CLS]".into(),
+                            "Hello".into(),
+                            "there".into(),
+                            "[SEP]".into(),
+                            "info".into(),
+                            "[SEP]".into()
+                        ],
+                        vec![None, None, None, None, None, None],
+                        vec![(0, 0), (0, 5), (6, 11), (0, 0), (10, 14), (0, 0)],
+                        vec![1, 0, 0, 1, 0, 1],
+                        vec![1, 1, 1, 1, 1, 1],
+                        vec![Encoding::new(
+                            vec![1, 13, 0, 17, 0],
+                            vec![0, 0, 0, 0, 1],
+                            vec![
+                                "[CLS]".into(),
+                                "you".into(),
+                                "[SEP]".into(),
+                                "info".into(),
+                                "[SEP]".into()
+                            ],
+                            vec![None, None, None, None, None,],
+                            vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
+                            vec![1, 0, 1, 0, 1],
+                            vec![1, 1, 1, 1, 1],
+                            vec![],
+                            HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]),
+                        ),],
+                        HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]),
+                    )
+                ],
+                HashMap::from_iter(vec![(0, 1..3), (1, 4..6)]),
+            )
+        );
+        assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
+        assert_eq!(pair_encoding.token_to_sequence(3), None);
+        assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
+        assert_eq!(pair_encoding.token_to_sequence(6), None);
+    }
     #[test]
     fn pair_must_use_both_sequences() {
         let processor = TemplateProcessing::builder()
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index b1b4e03c0..c6274c2f2 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -176,6 +176,10 @@ impl Encoding {
         &self.overflowing
     }
 
+    pub fn set_overflowing(&mut self, overflowing: Vec<Encoding>) {
+        self.overflowing = overflowing;
+    }
+
     pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding> {
         &mut self.overflowing
     }
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 6ccec4d73..a1075e1e0 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -100,14 +100,21 @@ pub trait PostProcessor {
         pair_encoding: Option<Encoding>,
         add_special_tokens: bool,
     ) -> Result<Encoding> {
-        let encodings = if let Some(pair_encoding) = pair_encoding {
+        let mut encodings = if let Some(pair_encoding) = pair_encoding {
             vec![encoding, pair_encoding]
         } else {
             vec![encoding]
         };
+        encodings.iter_mut().enumerate().for_each(|(i, encoding)| {
+            encoding.set_sequence_id(i);
+            encoding
+                .get_overflowing_mut()
+                .iter_mut()
+                .for_each(|encoding| encoding.set_sequence_id(i));
+            encoding.set_type_ids(vec![i as u32; encoding.len()]);
+        });
         let encodings = self.process_encodings(encodings, add_special_tokens)?;
-
         Ok(Encoding::merge(encodings, false))
     }
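
Usage sketch of the composable post-processors this diff enables, pieced together from the new tests above. Treat it as an illustration under assumptions, not the shipped API: the module paths (`processors::sequence`, `processors::bert`, `pre_tokenizers::byte_level`) and the crate-root re-exports of `PostProcessor`, `Encoding`, `Token`, and `Result` are not shown in this diff; only `Sequence::new`, `PostProcessorWrapper`, `PostProcessor::process`, `Encoding::from_tokens`, and `Token::new` appear in the changed files.

use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::processors::bert::BertProcessing;
use tokenizers::processors::sequence::Sequence;
use tokenizers::processors::PostProcessorWrapper;
use tokenizers::{Encoding, PostProcessor, Result, Token};

fn main() -> Result<()> {
    // Chain two post-processors. The new default `PostProcessor::process`
    // assigns a sequence id and type ids to each input encoding before
    // delegating to `process_encodings`, so every processor in the chain
    // works on the output of the previous one.
    let chain = Sequence::new(vec![
        PostProcessorWrapper::ByteLevel(ByteLevel::default()),
        PostProcessorWrapper::Bert(BertProcessing::default()),
    ]);

    // Build tiny encodings by hand, exactly as the new tests do.
    let encoding = Encoding::from_tokens(vec![Token::new(12, "Hello".into(), (0, 5))], 0);
    let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);

    // `process` works for single sequences and pairs on any composed processor.
    let processed = chain.process(encoding, Some(pair), true)?;
    println!("ids: {:?}", processed.get_ids());
    println!("type ids: {:?}", processed.get_type_ids());
    Ok(())
}

The same composition is what the `process_chain` test in processors/sequence.rs exercises with a single `ByteLevel` wrapper; chaining several processors is the point of making the trait composable.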