From cc60f86f7c84ffa7abe777f550f10f1d1fd79949 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 25 Aug 2022 16:47:06 +0200 Subject: [PATCH 1/4] Preparing rc1 release. --- bindings/node/CHANGELOG.md | 12 ++++++++++++ bindings/node/package.json | 2 +- bindings/python/CHANGELOG.md | 17 +++++++++++++++++ bindings/python/py_src/tokenizers/__init__.py | 2 +- bindings/python/setup.py | 2 +- tokenizers/CHANGELOG.md | 16 ++++++++++++++++ tokenizers/Cargo.toml | 2 +- 7 files changed, 49 insertions(+), 4 deletions(-) diff --git a/bindings/node/CHANGELOG.md b/bindings/node/CHANGELOG.md index b45d058b0..5d785e2f1 100644 --- a/bindings/node/CHANGELOG.md +++ b/bindings/node/CHANGELOG.md @@ -1,3 +1,8 @@ +## [0.13.0] + +- [#1008] `Decoder` is now a composable trait, but without being backward incompatible +- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible + ## [0.12.1] - [#938] **Reverted breaking change**. https://github.com/huggingface/transformers/issues/16520 @@ -160,6 +165,13 @@ The files must now be provided first when calling `tokenizer.train(files, traine - Actually add special tokens in tokenizers implementations ([acef252](https://github.com/huggingface/tokenizers/commit/acef252dacc43adc414175cfc325668ad1488753)) +[#956]: https://github.com/huggingface/tokenizers/pull/956 +[#1008]: https://github.com/huggingface/tokenizers/pull/1008 +[#1009]: https://github.com/huggingface/tokenizers/pull/1009 +[#1047]: https://github.com/huggingface/tokenizers/pull/1047 +[#1055]: https://github.com/huggingface/tokenizers/pull/1055 +[#1051]: https://github.com/huggingface/tokenizers/pull/1051 +[#1052]: https://github.com/huggingface/tokenizers/pull/1052 [#938]: https://github.com/huggingface/tokenizers/pull/938 [#939]: https://github.com/huggingface/tokenizers/pull/939 [#952]: https://github.com/huggingface/tokenizers/pull/952 diff --git a/bindings/node/package.json b/bindings/node/package.json index 88c2f50c2..f1aa4bf24 100644 --- a/bindings/node/package.json +++ b/bindings/node/package.json @@ -1,6 +1,6 @@ { "name": "tokenizers", - "version": "0.12.1", + "version": "0.13.0", "description": "", "main": "./dist/index.js", "types": "./dist/index.d.ts", diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index 6b0b34af0..0c8984413 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.13.0] + +- [#956] PyO3 version upgrade +- [#1055] M1 automated builds +- [#1008] `Decoder` is now a composable trait, but without being backward incompatible +- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible + +Both trait changes warrant a "major" number since, despite best efforts to not break backward + compatibility, the code is different enough that we cannot be exactly sure. + ## [0.12.1] - [#938] **Reverted breaking change**. 
https://github.com/huggingface/transformers/issues/16520 @@ -376,6 +386,13 @@ delimiter (Works like `.split(delimiter)`) - Fix a bug that was causing crashes in Python 3.5 +[#956]: https://github.com/huggingface/tokenizers/pull/956 +[#1008]: https://github.com/huggingface/tokenizers/pull/1008 +[#1009]: https://github.com/huggingface/tokenizers/pull/1009 +[#1047]: https://github.com/huggingface/tokenizers/pull/1047 +[#1055]: https://github.com/huggingface/tokenizers/pull/1055 +[#1051]: https://github.com/huggingface/tokenizers/pull/1051 +[#1052]: https://github.com/huggingface/tokenizers/pull/1052 [#938]: https://github.com/huggingface/tokenizers/pull/938 [#939]: https://github.com/huggingface/tokenizers/pull/939 [#952]: https://github.com/huggingface/tokenizers/pull/952 diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index c4ba61ce4..3ae412e61 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.12.1.dev0" +__version__ = "0.13.0.dev0" from typing import Tuple, Union, Tuple, List from enum import Enum diff --git a/bindings/python/setup.py b/bindings/python/setup.py index f3744ee64..649653c6b 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -8,7 +8,7 @@ setup( name="tokenizers", - version="0.12.1.dev0", + version="0.13.0.dev0", description="Fast and Customizable Tokenizers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/tokenizers/CHANGELOG.md b/tokenizers/CHANGELOG.md index ad986235a..79001bddd 100644 --- a/tokenizers/CHANGELOG.md +++ b/tokenizers/CHANGELOG.md @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.13.0] + +- [#1009] `unstable_wasm` feature to support building on Wasm (it's unstable !) +- [#1008] `Decoder` is now a composable trait, but without being backward incompatible +- [#1047, #1051, #1052] `Processor` is now a composable trait, but without being backward incompatible + +Both trait changes warrant a "major" number since, despite best efforts to not break backward + compatibility, the code is different enough that we cannot be exactly sure. + ## [0.12.1] - [#938] **Reverted breaking change**. 
https://github.com/huggingface/transformers/issues/16520 @@ -157,6 +166,13 @@ split up in multiple bytes - [#174]: The `LongestFirst` truncation strategy had a bug +[#956]: https://github.com/huggingface/tokenizers/pull/956 +[#1008]: https://github.com/huggingface/tokenizers/pull/1008 +[#1009]: https://github.com/huggingface/tokenizers/pull/1009 +[#1047]: https://github.com/huggingface/tokenizers/pull/1047 +[#1055]: https://github.com/huggingface/tokenizers/pull/1055 +[#1051]: https://github.com/huggingface/tokenizers/pull/1051 +[#1052]: https://github.com/huggingface/tokenizers/pull/1052 [#938]: https://github.com/huggingface/tokenizers/pull/938 [#939]: https://github.com/huggingface/tokenizers/pull/939 [#952]: https://github.com/huggingface/tokenizers/pull/952 diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index 222aa0275..3f4877c88 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -2,7 +2,7 @@ authors = ["Anthony MOI "] edition = "2018" name = "tokenizers" -version = "0.12.1" +version = "0.13.0" homepage = "https://github.com/huggingface/tokenizers" repository = "https://github.com/huggingface/tokenizers" documentation = "https://docs.rs/tokenizers/" From 471413974e29f2c39230680f93438cdeef2d405f Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 31 Aug 2022 12:55:21 +0200 Subject: [PATCH 2/4] Fixing test_alignment_methods --- bindings/python/Cargo.lock | 2 +- tokenizers/src/pre_tokenizers/byte_level.rs | 4 +- tokenizers/src/processors/bert.rs | 87 ++++++++++++++++++++ tokenizers/src/processors/roberta.rs | 88 ++++++++++++++++++++- tokenizers/src/processors/sequence.rs | 6 +- tokenizers/src/tokenizer/mod.rs | 7 +- 6 files changed, 184 insertions(+), 10 deletions(-) diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 1867a9ed3..5a96c230e 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -1700,7 +1700,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokenizers" -version = "0.12.1" +version = "0.13.0" dependencies = [ "aho-corasick", "cached-path", diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs index 243f1a137..b8468afc1 100644 --- a/tokenizers/src/pre_tokenizers/byte_level.rs +++ b/tokenizers/src/pre_tokenizers/byte_level.rs @@ -484,7 +484,7 @@ mod tests { ); let expected = Encoding::new( vec![0; 5], - vec![], + vec![0; 5], vec![ "Ġ".into(), "ĠĠĠĠHelloĠĠ".into(), @@ -508,7 +508,7 @@ mod tests { let pair_expected = Encoding::new( vec![0; 10], - vec![], + vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1], vec![ "Ġ".into(), "ĠĠĠĠHelloĠĠ".into(), diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs index 39a834cd2..627f9d180 100644 --- a/tokenizers/src/processors/bert.rs +++ b/tokenizers/src/processors/bert.rs @@ -195,4 +195,91 @@ mod tests { bert ); } + + #[test] + fn bert_processing() { + let processor = BertProcessing::default(); + assert_eq!(processor.added_tokens(false), 2); + assert_eq!(processor.added_tokens(true), 3); + + use crate::Token; + let encoding = Encoding::from_tokens( + vec![ + Token::new(12, "Hello".into(), (0, 5)), + Token::new(14, "there".into(), (6, 11)), + ], + 0, + ); + let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0); + let single_encoding = processor.process(encoding.clone(), None, true).unwrap(); + assert_eq!( + single_encoding, + Encoding::new( + vec![101, 12, 14, 102], + vec![0, 0, 0, 0], + 
vec![ + "[CLS]".into(), + "Hello".into(), + "there".into(), + "[SEP]".into() + ], + vec![None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0)], + vec![1, 0, 0, 1], + vec![1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3)]), + ) + ); + assert_eq!(single_encoding.token_to_sequence(2), Some(0)); + assert_eq!(single_encoding.token_to_sequence(3), None); + let pair_encoding = processor + .process(encoding.clone(), Some(pair.clone()), true) + .unwrap(); + assert_eq!( + pair_encoding, + Encoding::new( + vec![101, 12, 14, 102, 15, 102], + vec![0, 0, 0, 0, 1, 1], + vec![ + "[CLS]".into(), + "Hello".into(), + "there".into(), + "[SEP]".into(), + "pair".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)], + vec![1, 0, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]), + ) + ); + assert_eq!(pair_encoding.token_to_sequence(2), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(3), None); + assert_eq!(pair_encoding.token_to_sequence(4), Some(1)); + assert_eq!(pair_encoding.token_to_sequence(5), None); + + // No special tokens + let pair_encoding = processor.process(encoding, Some(pair), false).unwrap(); + assert_eq!( + pair_encoding, + Encoding::new( + vec![12, 14, 15], + vec![0, 0, 1], + vec!["Hello".into(), "there".into(), "pair".into(),], + vec![None, None, None], + vec![(0, 5), (6, 11), (0, 4)], + vec![0, 0, 0], + vec![1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]), + ) + ); + assert_eq!(pair_encoding.token_to_sequence(0), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(1), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(2), Some(1)); + } } diff --git a/tokenizers/src/processors/roberta.rs b/tokenizers/src/processors/roberta.rs index ab83e4629..749164182 100644 --- a/tokenizers/src/processors/roberta.rs +++ b/tokenizers/src/processors/roberta.rs @@ -146,7 +146,7 @@ impl PostProcessor for RobertaProcessing { ) } else { let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat(); - let pair_type_ids = vec![0; encoding.get_ids().len() + 2]; + let pair_type_ids = vec![1; encoding.get_ids().len() + 2]; let pair_tokens = [ &[self.sep.0.clone()], encoding.get_tokens(), @@ -176,7 +176,7 @@ impl PostProcessor for RobertaProcessing { .map(|encoding| { let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat(); - let pair_type_ids = vec![0; encoding.get_ids().len() + 2]; + let pair_type_ids = vec![1; encoding.get_ids().len() + 2]; let pair_tokens = [ &[self.sep.0.clone()], encoding.get_tokens(), @@ -240,4 +240,88 @@ mod tests { roberta ); } + + #[test] + fn roberta_processing() { + let processor = RobertaProcessing::default(); + assert_eq!(processor.added_tokens(false), 2); + assert_eq!(processor.added_tokens(true), 4); + + use crate::Token; + let encoding = Encoding::from_tokens( + vec![ + Token::new(12, "Hello".into(), (0, 5)), + Token::new(14, "there".into(), (6, 11)), + ], + 0, + ); + let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0); + let single_encoding = processor.process(encoding.clone(), None, true).unwrap(); + assert_eq!( + single_encoding, + Encoding::new( + vec![0, 12, 14, 2], + vec![0, 0, 0, 0], + vec!["".into(), "Hello".into(), "there".into(), "".into()], + vec![None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0)], + vec![1, 0, 0, 1], + vec![1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3)]), + ) + ); + 
assert_eq!(single_encoding.token_to_sequence(2), Some(0)); + assert_eq!(single_encoding.token_to_sequence(3), None); + let pair_encoding = processor + .process(encoding.clone(), Some(pair.clone()), true) + .unwrap(); + assert_eq!( + pair_encoding, + Encoding::new( + vec![0, 12, 14, 2, 2, 15, 2], + vec![0, 0, 0, 0, 1, 1, 1], + vec![ + "".into(), + "Hello".into(), + "there".into(), + "".into(), + "".into(), + "pair".into(), + "".into() + ], + vec![None, None, None, None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 0), (0, 4), (0, 0)], + vec![1, 0, 0, 1, 1, 0, 1], + vec![1, 1, 1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3), (1, 5..6)]), + ) + ); + assert_eq!(pair_encoding.token_to_sequence(2), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(3), None); + assert_eq!(pair_encoding.token_to_sequence(4), None); + assert_eq!(pair_encoding.token_to_sequence(5), Some(1)); + assert_eq!(pair_encoding.token_to_sequence(6), None); + + // No special tokens + let pair_encoding = processor.process(encoding, Some(pair), false).unwrap(); + assert_eq!( + pair_encoding, + Encoding::new( + vec![12, 14, 15], + vec![0, 0, 1], + vec!["Hello".into(), "there".into(), "pair".into(),], + vec![None, None, None], + vec![(0, 5), (6, 11), (0, 4)], + vec![0, 0, 0], + vec![1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 0..2), (1, 2..3)]), + ) + ); + assert_eq!(pair_encoding.token_to_sequence(0), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(1), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(2), Some(1)); + } } diff --git a/tokenizers/src/processors/sequence.rs b/tokenizers/src/processors/sequence.rs index 21d58d416..66c670ad8 100644 --- a/tokenizers/src/processors/sequence.rs +++ b/tokenizers/src/processors/sequence.rs @@ -47,7 +47,7 @@ mod tests { fn process_chain() { let start = Encoding::new( vec![0; 5], - vec![], + vec![0; 5], vec![ "Ġ".into(), "ĠĠĠĠHelloĠĠ".into(), @@ -67,7 +67,7 @@ mod tests { let sequence = Sequence::new(vec![PostProcessorWrapper::ByteLevel(bytelevel)]); let expected = Encoding::new( vec![0; 5], - vec![], + vec![0; 5], vec![ "Ġ".into(), "ĠĠĠĠHelloĠĠ".into(), @@ -94,7 +94,7 @@ mod tests { let pair_expected = Encoding::new( vec![0; 10], - vec![], + vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1], vec![ "Ġ".into(), "ĠĠĠĠHelloĠĠ".into(), diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 6ccec4d73..ed5d05170 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -100,14 +100,17 @@ pub trait PostProcessor { pair_encoding: Option, add_special_tokens: bool, ) -> Result { - let encodings = if let Some(pair_encoding) = pair_encoding { + let mut encodings = if let Some(pair_encoding) = pair_encoding { vec![encoding, pair_encoding] } else { vec![encoding] }; + encodings.iter_mut().enumerate().for_each(|(i, encoding)| { + encoding.set_sequence_id(i); + encoding.set_type_ids(vec![i as u32; encoding.len()]); + }); let encodings = self.process_encodings(encodings, add_special_tokens)?; - Ok(Encoding::merge(encodings, false)) } From 0c2f6461d4f3851da83e44fee3bf05a785c10539 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 31 Aug 2022 19:46:22 +0200 Subject: [PATCH 3/4] Fixing the overflowing sequence_id issue (LayoutLMv2 tests caught this). 
--- tokenizers/src/processors/template.rs | 73 +++++++++++++++++++++++++++ tokenizers/src/tokenizer/encoding.rs | 2 + tokenizers/src/tokenizer/mod.rs | 4 ++ 3 files changed, 79 insertions(+) diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs index 262323cfa..72d537729 100644 --- a/tokenizers/src/processors/template.rs +++ b/tokenizers/src/processors/template.rs @@ -885,6 +885,79 @@ mod tests { assert_eq!(pair_encoding.token_to_sequence(5), None); } + #[test] + fn template_processing_overflowing() { + let processor = tests::get_bert_template(); + assert_eq!(processor.added_tokens(false), 2); + assert_eq!(processor.added_tokens(true), 3); + + use crate::Token; + let encoding = Encoding::from_tokens( + vec![ + Token::new(12, "Hello".into(), (0, 5)), + Token::new(14, "there".into(), (6, 11)), + Token::new(13, "you".into(), (12, 15)), + ], + 0, + ); + let pair = Encoding::from_tokens( + vec![ + Token::new(15, "pair".into(), (0, 4)), + Token::new(16, "with".into(), (5, 9)), + Token::new(17, "info".into(), (10, 14)), + ], + 0, + ); + + let single_encoding = processor.process(encoding.clone(), None, true).unwrap(); + assert_eq!( + single_encoding, + Encoding::new( + vec![1, 12, 14, 0], + vec![0, 0, 0, 0], + vec![ + "[CLS]".into(), + "Hello".into(), + "there".into(), + "[SEP]".into() + ], + vec![None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0)], + vec![1, 0, 0, 1], + vec![1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3)]), + ) + ); + assert_eq!(single_encoding.token_to_sequence(2), Some(0)); + assert_eq!(single_encoding.token_to_sequence(3), None); + let pair_encoding = processor.process(encoding, Some(pair), true).unwrap(); + assert_eq!( + pair_encoding, + Encoding::new( + vec![1, 12, 14, 0, 15, 0], + vec![0, 0, 0, 0, 1, 1], + vec![ + "[CLS]".into(), + "Hello".into(), + "there".into(), + "[SEP]".into(), + "pair".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)], + vec![1, 0, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]), + ) + ); + assert_eq!(pair_encoding.token_to_sequence(2), Some(0)); + assert_eq!(pair_encoding.token_to_sequence(3), None); + assert_eq!(pair_encoding.token_to_sequence(4), Some(1)); + assert_eq!(pair_encoding.token_to_sequence(5), None); + } #[test] fn pair_must_use_both_sequences() { let processor = TemplateProcessing::builder() diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs index b1b4e03c0..fe7a35d2a 100644 --- a/tokenizers/src/tokenizer/encoding.rs +++ b/tokenizers/src/tokenizer/encoding.rs @@ -405,6 +405,8 @@ impl Encoding { // Handle merging the overflowing parts too: Combine them all // In most of the cases, we expect `pair.overflowing.len() == 0` let mut overflowings = vec![]; + println!("Overflowing self {:?}", self.overflowing); + println!("Overflowing pair {:?}", pair.overflowing); // 1. 
All our overflowings with all the others for self_o in &self.overflowing { diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index ed5d05170..a1075e1e0 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -107,6 +107,10 @@ pub trait PostProcessor { }; encodings.iter_mut().enumerate().for_each(|(i, encoding)| { encoding.set_sequence_id(i); + encoding + .get_overflowing_mut() + .iter_mut() + .for_each(|encoding| encoding.set_sequence_id(i)); encoding.set_type_ids(vec![i as u32; encoding.len()]); }); From 1a7d0ae9e8071e9b5cbd07663043673e624c76b8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 31 Aug 2022 20:50:50 +0200 Subject: [PATCH 4/4] Adding overly complex overflowing test. --- tokenizers/src/processors/template.rs | 131 +++++++++++++++++++++++--- tokenizers/src/tokenizer/encoding.rs | 6 +- 2 files changed, 121 insertions(+), 16 deletions(-) diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs index 72d537729..13bc91e56 100644 --- a/tokenizers/src/processors/template.rs +++ b/tokenizers/src/processors/template.rs @@ -892,22 +892,26 @@ mod tests { assert_eq!(processor.added_tokens(true), 3); use crate::Token; - let encoding = Encoding::from_tokens( + let mut encoding = Encoding::from_tokens( vec![ Token::new(12, "Hello".into(), (0, 5)), Token::new(14, "there".into(), (6, 11)), - Token::new(13, "you".into(), (12, 15)), ], 0, ); - let pair = Encoding::from_tokens( + let overflowing = Encoding::from_tokens(vec![Token::new(13, "you".into(), (12, 15))], 0); + encoding.set_overflowing(vec![overflowing]); + + let mut pair = Encoding::from_tokens( vec![ Token::new(15, "pair".into(), (0, 4)), Token::new(16, "with".into(), (5, 9)), - Token::new(17, "info".into(), (10, 14)), ], 0, ); + let pair_overflowing = + Encoding::from_tokens(vec![Token::new(17, "info".into(), (10, 14))], 0); + pair.set_overflowing(vec![pair_overflowing]); let single_encoding = processor.process(encoding.clone(), None, true).unwrap(); assert_eq!( @@ -925,38 +929,137 @@ mod tests { vec![(0, 0), (0, 5), (6, 11), (0, 0)], vec![1, 0, 0, 1], vec![1, 1, 1, 1], - vec![], + vec![Encoding::new( + vec![1, 13, 0], + vec![0, 0, 0], + vec!["[CLS]".into(), "you".into(), "[SEP]".into()], + vec![None, None, None], + vec![(0, 0), (12, 15), (0, 0)], + vec![1, 0, 1], + vec![1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..2)]), + )], HashMap::from_iter(vec![(0, 1..3)]), ) ); assert_eq!(single_encoding.token_to_sequence(2), Some(0)); assert_eq!(single_encoding.token_to_sequence(3), None); let pair_encoding = processor.process(encoding, Some(pair), true).unwrap(); + println!("{pair_encoding:#?}"); assert_eq!( pair_encoding, Encoding::new( - vec![1, 12, 14, 0, 15, 0], - vec![0, 0, 0, 0, 1, 1], + vec![1, 12, 14, 0, 15, 16, 0], + vec![0, 0, 0, 0, 1, 1, 1], vec![ "[CLS]".into(), "Hello".into(), "there".into(), "[SEP]".into(), "pair".into(), + "with".into(), "[SEP]".into() ], - vec![None, None, None, None, None, None], - vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (0, 0)], - vec![1, 0, 0, 1, 0, 1], - vec![1, 1, 1, 1, 1, 1], - vec![], - HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]), + vec![None, None, None, None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (5, 9), (0, 0)], + vec![1, 0, 0, 1, 0, 0, 1], + vec![1, 1, 1, 1, 1, 1, 1], + vec![ + Encoding::new( + vec![1, 13, 0, 15, 16, 0], + vec![0, 0, 0, 1, 1, 1], + vec![ + "[CLS]".into(), + "you".into(), + "[SEP]".into(), + "pair".into(), + "with".into(), + "[SEP]".into() 
+ ], + vec![None, None, None, None, None, None], + vec![(0, 0), (12, 15), (0, 0), (0, 4), (5, 9), (0, 0)], + vec![1, 0, 1, 0, 0, 1], + vec![1, 1, 1, 1, 1, 1], + vec![Encoding::new( + vec![1, 13, 0, 17, 0], + vec![0, 0, 0, 0, 1], + vec![ + "[CLS]".into(), + "you".into(), + "[SEP]".into(), + "info".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None,], + vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)], + vec![1, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]), + ),], + HashMap::from_iter(vec![(1, 3..5), (0, 1..2)]), + ), + Encoding::new( + vec![1, 13, 0, 17, 0], + vec![0, 0, 0, 0, 1], + vec![ + "[CLS]".into(), + "you".into(), + "[SEP]".into(), + "info".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None,], + vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)], + vec![1, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]), + ), + Encoding::new( + vec![1, 12, 14, 0, 17, 0], + vec![0, 0, 0, 0, 0, 1], + vec![ + "[CLS]".into(), + "Hello".into(), + "there".into(), + "[SEP]".into(), + "info".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None, None], + vec![(0, 0), (0, 5), (6, 11), (0, 0), (10, 14), (0, 0)], + vec![1, 0, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1, 1], + vec![Encoding::new( + vec![1, 13, 0, 17, 0], + vec![0, 0, 0, 0, 1], + vec![ + "[CLS]".into(), + "you".into(), + "[SEP]".into(), + "info".into(), + "[SEP]".into() + ], + vec![None, None, None, None, None,], + vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)], + vec![1, 0, 1, 0, 1], + vec![1, 1, 1, 1, 1], + vec![], + HashMap::from_iter(vec![(0, 1..2), (1, 3..4)]), + ),], + HashMap::from_iter(vec![(0, 1..3), (1, 4..5)]), + ) + ], + HashMap::from_iter(vec![(0, 1..3), (1, 4..6)]), ) ); assert_eq!(pair_encoding.token_to_sequence(2), Some(0)); assert_eq!(pair_encoding.token_to_sequence(3), None); assert_eq!(pair_encoding.token_to_sequence(4), Some(1)); - assert_eq!(pair_encoding.token_to_sequence(5), None); + assert_eq!(pair_encoding.token_to_sequence(5), Some(1)); + assert_eq!(pair_encoding.token_to_sequence(6), None); } #[test] fn pair_must_use_both_sequences() { diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs index fe7a35d2a..c6274c2f2 100644 --- a/tokenizers/src/tokenizer/encoding.rs +++ b/tokenizers/src/tokenizer/encoding.rs @@ -176,6 +176,10 @@ impl Encoding { &self.overflowing } + pub fn set_overflowing(&mut self, overflowing: Vec) { + self.overflowing = overflowing; + } + pub fn get_overflowing_mut(&mut self) -> &mut Vec { &mut self.overflowing } @@ -405,8 +409,6 @@ impl Encoding { // Handle merging the overflowing parts too: Combine them all // In most of the cases, we expect `pair.overflowing.len() == 0` let mut overflowings = vec![]; - println!("Overflowing self {:?}", self.overflowing); - println!("Overflowing pair {:?}", pair.overflowing); // 1. All our overflowings with all the others for self_o in &self.overflowing {
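
Below the patch series, two short usage sketches of the behaviour it introduces. First, patch 2 changes `RobertaProcessing` so that the pair segment is emitted with `type_ids` of 1 (previously every segment was 0) and adds regression tests for `BertProcessing` and `RobertaProcessing`. This sketch mirrors the expectations of the new `roberta_processing` test; the `use` paths and the `fn main` wrapper are assumptions about the crate layout, not part of the patch.

```rust
// Hedged sketch: shows the post-0.13.0 RobertaProcessing pair type_ids.
// Import paths are assumed from the usual crate layout.
use tokenizers::processors::roberta::RobertaProcessing;
use tokenizers::{Encoding, PostProcessor, Token};

fn main() {
    let processor = RobertaProcessing::default();

    // Two pre-tokenized sequences, built the same way as in the patch's tests.
    let encoding = Encoding::from_tokens(
        vec![
            Token::new(12, "Hello".into(), (0, 5)),
            Token::new(14, "there".into(), (6, 11)),
        ],
        0,
    );
    let pair = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);

    // Produces `<s> Hello there </s> </s> pair </s>`; after the fix the three
    // pair-side tokens carry type id 1 instead of 0.
    let processed = processor.process(encoding, Some(pair), true).unwrap();
    assert_eq!(processed.get_type_ids(), &[0u32, 0, 0, 0, 1, 1, 1]);
    assert_eq!(processed.token_to_sequence(1), Some(0));
    assert_eq!(processed.token_to_sequence(5), Some(1));
}
```

With `add_special_tokens` set to `false` the same call only concatenates the two sequences, which is the case covered by the "No special tokens" block of the new tests.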
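
Second, patches 2 and 3 rewrite the default `PostProcessor::process`: before merging, every input sequence (and, after patch 3, every overflowing encoding it carries) is stamped with its sequence id and per-sequence `type_ids`, which is what keeps `token_to_sequence` and the other alignment methods correct after processing. The sketch below reproduces that stamping-then-merge step with the same `Encoding` methods the patch calls; the imports and the `fn main` wrapper are again illustrative assumptions.

```rust
// Hedged sketch of what the new default PostProcessor::process does before
// delegating to process_encodings: assign sequence ids and type ids, then merge.
use tokenizers::{Encoding, Token};

fn main() {
    let first = Encoding::from_tokens(
        vec![
            Token::new(12, "Hello".into(), (0, 5)),
            Token::new(14, "there".into(), (6, 11)),
        ],
        0,
    );
    let second = Encoding::from_tokens(vec![Token::new(15, "pair".into(), (0, 4))], 0);

    let mut encodings = vec![first, second];
    // Stamp each sequence, and each of its overflowing parts, with its
    // sequence id, and give it uniform type ids (0 for the first sequence,
    // 1 for the second). This is the loop added in patches 2 and 3.
    encodings.iter_mut().enumerate().for_each(|(i, encoding)| {
        encoding.set_sequence_id(i);
        encoding
            .get_overflowing_mut()
            .iter_mut()
            .for_each(|overflowing| overflowing.set_sequence_id(i));
        encoding.set_type_ids(vec![i as u32; encoding.len()]);
    });

    // Merging afterwards preserves the per-sequence bookkeeping, so the
    // alignment methods still know which token came from which input.
    let merged = Encoding::merge(encodings, false);
    assert_eq!(merged.get_type_ids(), &[0u32, 0, 1]);
    assert_eq!(merged.token_to_sequence(0), Some(0));
    assert_eq!(merged.token_to_sequence(2), Some(1));
}
```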