From c84713605e154996bfbc9559adf7eab7e8aa9755 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 20:50:07 +0200 Subject: [PATCH 01/11] Makes `decode` and `decode_batch` work on borrowed content. --- tokenizers/src/cli.rs | 2 +- tokenizers/src/tokenizer/mod.rs | 8 ++++---- tokenizers/tests/documentation.rs | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tokenizers/src/cli.rs b/tokenizers/src/cli.rs index 6bf523ef8..54b82357f 100644 --- a/tokenizers/src/cli.rs +++ b/tokenizers/src/cli.rs @@ -59,7 +59,7 @@ fn shell(vocab: &str, merges: &str) -> Result<()> { println!("Offsets:\t{:?}", encoded.get_offsets()); println!( "Decoded:\t{}", - tokenizer.decode(encoded.get_ids().to_vec(), true).unwrap() + tokenizer.decode(encoded.get_ids(), true).unwrap() ); println!("Tokenized in {:?}", elapsed); } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index a88306f3a..01ec187cd 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -795,12 +795,12 @@ where } /// Decode the given ids, back to a String - pub fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String> { + pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> { let tokens = ids - .into_iter() + .iter() .filter_map(|id| { self.added_vocabulary - .id_to_token(id, &self.model) + .id_to_token(*id, &self.model) .filter(|token| { !skip_special_tokens || !self.added_vocabulary.is_special_token(token) }) @@ -1008,7 +1008,7 @@ where /// Decode all sentences in parallel pub fn decode_batch( &self, - sentences: Vec<Vec<u32>>, + sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>> where diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs index 605f8a4bd..629360c0f 100644 --- a/tokenizers/tests/documentation.rs +++ b/tokenizers/tests/documentation.rs @@ -54,7 +54,7 @@ fn load_tokenizer() { assert_eq!(encodings.get_ids(), ids); assert_eq!(encodings.get_tokens(), tokens); - let decoded = tokenizer.decode(ids, false).unwrap(); + let decoded = tokenizer.decode(&ids, false).unwrap(); assert_eq!(decoded, example); } @@ -347,7 +347,7 @@ fn pipeline() -> tokenizers::Result<()> { // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2] let decoded = tokenizer.decode( - vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], + &vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true, )?; println!("{}", decoded); @@ -435,7 +435,7 @@ fn pipeline_bert() -> tokenizers::Result<()> { println!("{:?}", output.get_tokens()); // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"] - let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?; + let decoded = bert_tokenizer.decode(output.get_ids(), true)?; println!("{}", decoded); // "welcome to the tok ##eni ##zer ##s library ." // END bert_test_decoding @@ -451,7 +451,7 @@ fn pipeline_bert() -> tokenizers::Result<()> { use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder; bert_tokenizer.with_decoder(WordPieceDecoder::default()); - let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?; + let decoded = bert_tokenizer.decode(output.get_ids(), true)?; // "welcome to the tokenizers library." 
// END bert_proper_decoding assert_eq!(decoded, "welcome to the tokenizers library."); From 4d5e55fb6492959851c182074f4db5427e06dea2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:14:17 +0200 Subject: [PATCH 02/11] Make `decode_batch` work with borrowed content. --- bindings/python/src/tokenizer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 95a954a27..1fe296ed0 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1009,7 +1009,7 @@ impl PyTokenizer { #[pyo3(signature = (ids, skip_special_tokens = true))] #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")] fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> { - ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() + ToPyResult(self.tokenizer.decode(&ids, skip_special_tokens)).into() } /// Decode a batch of ids back to their corresponding string @@ -1032,7 +1032,8 @@ impl PyTokenizer { skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into() + let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); + ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() }) } From 1d5bb4430eae96f3f71b199e9411d60d42cff0dd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:16:32 +0200 Subject: [PATCH 03/11] Fix lint. --- tokenizers/tests/documentation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs index 629360c0f..7cf04debe 100644 --- a/tokenizers/tests/documentation.rs +++ b/tokenizers/tests/documentation.rs @@ -347,7 +347,7 @@ fn pipeline() -> tokenizers::Result<()> { // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2] let decoded = tokenizer.decode( - &vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], + &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true, )?; println!("{}", decoded); From 6172ee76e9acba1a1ede96fbd5df04315879a814 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:28:42 +0200 Subject: [PATCH 04/11] Attempt to map it into Node. --- bindings/node/native/src/tasks/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 2a7e3e0ad..f6e831ab4 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,14 +106,14 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(ids.to_vec(), *skip_special_tokens) + .decode(ids.as_slice(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker .tokenizer .read() .unwrap() - .decode_batch(ids.to_vec(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| v.as_slice()).collect(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 9ccb05b84db78847d8ea920405275c3ab0b18471 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:39:59 +0200 Subject: [PATCH 05/11] Second attempt. 
--- bindings/node/native/src/tasks/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index f6e831ab4..f997fc16d 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,14 +106,14 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(ids.as_slice(), *skip_special_tokens) + .decode(&ids, *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| v.as_slice()).collect(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| &v).collect(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 21253df577add37d49ca5009d9e09e5d43541767 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:47:02 +0200 Subject: [PATCH 06/11] Step by step. --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index f997fc16d..3f8dabd16 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,7 +106,7 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(&ids, *skip_special_tokens) + .decode(ids.as_slice(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker From 6164f9cc0b2eefdf01526f9559b8d2f71d9a7da1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:57:41 +0200 Subject: [PATCH 07/11] One more step. --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 3f8dabd16..25f7f1512 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -113,7 +113,7 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| &v).collect(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From d491913248ce739f189aa1fbcd8cc6f28a90724d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 22:08:36 +0200 Subject: [PATCH 08/11] Fix lint. 
--- bindings/node/native/src/tasks/tokenizer.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 25f7f1512..0d6a2ae22 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -113,7 +113,10 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), *skip_special_tokens) + .decode_batch( + &ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), + *skip_special_tokens + ) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 774c357a2e417ed4909b55fde2d1c54338ef6019 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 22:16:42 +0200 Subject: [PATCH 09/11] Please ... --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 0d6a2ae22..495ae53a7 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -115,7 +115,7 @@ impl Task for DecodeTask { .unwrap() .decode_batch( &ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), - *skip_special_tokens + *skip_special_tokens, ) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), From 2f7ec04dc84df3cc5488625a4fcb492fdc3545e2 Mon Sep 17 00:00:00 2001 From: Nicolas Patry <patry.nicolas@protonmail.com> Date: Wed, 17 May 2023 10:20:34 +0200 Subject: [PATCH 10/11] Removing collect. --- bindings/python/src/tokenizer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 1fe296ed0..97a2c6466 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1028,12 +1028,11 @@ impl PyTokenizer { fn decode_batch( &self, py: Python<'_>, - sequences: Vec<Vec<u32>>, + sequences: &[&[u32]], skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); - ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() + ToPyResult(self.tokenizer.decode_batch(slices, skip_special_tokens)).into() }) } From 500848c424721fb6b66a39e738f2c11401e832ca Mon Sep 17 00:00:00 2001 From: Nicolas Patry <patry.nicolas@protonmail.com> Date: Wed, 17 May 2023 10:59:16 +0200 Subject: [PATCH 11/11] Revert "Removing collect." This reverts commit 2f7ec04dc84df3cc5488625a4fcb492fdc3545e2. --- bindings/python/src/tokenizer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 97a2c6466..1fe296ed0 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1028,11 +1028,12 @@ impl PyTokenizer { fn decode_batch( &self, py: Python<'_>, - sequences: &[&[u32]], + sequences: Vec<Vec<u32>>, skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - ToPyResult(self.tokenizer.decode_batch(slices, skip_special_tokens)).into() + let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); + ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() }) }
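
Net effect of the series: `Tokenizer::decode` now takes `&[u32]` and `Tokenizer::decode_batch` takes `&[&[u32]]`, so callers borrow ids instead of handing over owned vectors. A minimal sketch of the resulting call pattern, assuming a serialized tokenizer at an illustrative `tokenizer.json` path:

    use tokenizers::Tokenizer;

    fn main() -> tokenizers::Result<()> {
        // Illustrative path; any serialized tokenizer file works here.
        let tokenizer = Tokenizer::from_file("tokenizer.json")?;
        let encoding = tokenizer.encode("Hello, world!", false)?;

        // `decode` borrows the ids, so `encoding` stays usable afterwards.
        let decoded = tokenizer.decode(encoding.get_ids(), true)?;
        println!("{}", decoded);

        // `decode_batch` takes a slice of borrowed id slices; callers holding
        // a Vec<Vec<u32>> convert with `.iter().map(|v| v.as_slice())`, as the
        // Python and Node bindings above do.
        let batch: Vec<&[u32]> = vec![encoding.get_ids()];
        let decoded_batch = tokenizer.decode_batch(&batch, true)?;
        println!("{:?}", decoded_batch);

        Ok(())
    }

This removes the `to_vec()` copies that callers such as cli.rs and the documentation tests previously needed only to satisfy the owned `Vec<u32>` signature.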