From c84713605e154996bfbc9559adf7eab7e8aa9755 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 20:50:07 +0200 Subject: [PATCH 01/11] Makes `decode` and `decode_batch` work on borrowed content. --- tokenizers/src/cli.rs | 2 +- tokenizers/src/tokenizer/mod.rs | 8 ++++---- tokenizers/tests/documentation.rs | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tokenizers/src/cli.rs b/tokenizers/src/cli.rs index 6bf523ef8..54b82357f 100644 --- a/tokenizers/src/cli.rs +++ b/tokenizers/src/cli.rs @@ -59,7 +59,7 @@ fn shell(vocab: &str, merges: &str) -> Result<()> { println!("Offsets:\t{:?}", encoded.get_offsets()); println!( "Decoded:\t{}", - tokenizer.decode(encoded.get_ids().to_vec(), true).unwrap() + tokenizer.decode(encoded.get_ids(), true).unwrap() ); println!("Tokenized in {:?}", elapsed); } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index a88306f3a..01ec187cd 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -795,12 +795,12 @@ where } /// Decode the given ids, back to a String - pub fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> Result<String> { + pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> { let tokens = ids - .into_iter() + .iter() .filter_map(|id| { self.added_vocabulary - .id_to_token(id, &self.model) + .id_to_token(*id, &self.model) .filter(|token| { !skip_special_tokens || !self.added_vocabulary.is_special_token(token) }) @@ -1008,7 +1008,7 @@ where /// Decode all sentences in parallel pub fn decode_batch( &self, - sentences: Vec<Vec<u32>>, + sentences: &[&[u32]], skip_special_tokens: bool, ) -> Result<Vec<String>> where diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs index 605f8a4bd..629360c0f 100644 --- a/tokenizers/tests/documentation.rs +++ b/tokenizers/tests/documentation.rs @@ -54,7 +54,7 @@ fn load_tokenizer() { assert_eq!(encodings.get_ids(), ids); assert_eq!(encodings.get_tokens(), tokens); - let decoded = tokenizer.decode(ids, false).unwrap(); + let decoded = tokenizer.decode(&ids, false).unwrap(); assert_eq!(decoded, example); } @@ -347,7 +347,7 @@ fn pipeline() -> tokenizers::Result<()> { // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2] let decoded = tokenizer.decode( - vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], + &vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true, )?; println!("{}", decoded); @@ -435,7 +435,7 @@ fn pipeline_bert() -> tokenizers::Result<()> { println!("{:?}", output.get_tokens()); // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"] - let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?; + let decoded = bert_tokenizer.decode(output.get_ids(), true)?; println!("{}", decoded); // "welcome to the tok ##eni ##zer ##s library ." // END bert_test_decoding @@ -451,7 +451,7 @@ fn pipeline_bert() -> tokenizers::Result<()> { use tokenizers::decoders::wordpiece::WordPiece as WordPieceDecoder; bert_tokenizer.with_decoder(WordPieceDecoder::default()); - let decoded = bert_tokenizer.decode(output.get_ids().to_vec(), true)?; + let decoded = bert_tokenizer.decode(output.get_ids(), true)?; // "welcome to the tokenizers library." 
// END bert_proper_decoding assert_eq!(decoded, "welcome to the tokenizers library."); From 4d5e55fb6492959851c182074f4db5427e06dea2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:14:17 +0200 Subject: [PATCH 02/11] Make `decode_batch` work with borrowed content. --- bindings/python/src/tokenizer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 95a954a27..1fe296ed0 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1009,7 +1009,7 @@ impl PyTokenizer { #[pyo3(signature = (ids, skip_special_tokens = true))] #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")] fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> { - ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() + ToPyResult(self.tokenizer.decode(&ids, skip_special_tokens)).into() } /// Decode a batch of ids back to their corresponding string @@ -1032,7 +1032,8 @@ impl PyTokenizer { skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - ToPyResult(self.tokenizer.decode_batch(sequences, skip_special_tokens)).into() + let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); + ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() }) } From 1d5bb4430eae96f3f71b199e9411d60d42cff0dd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:16:32 +0200 Subject: [PATCH 03/11] Fix lint. --- tokenizers/tests/documentation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs index 629360c0f..7cf04debe 100644 --- a/tokenizers/tests/documentation.rs +++ b/tokenizers/tests/documentation.rs @@ -347,7 +347,7 @@ fn pipeline() -> tokenizers::Result<()> { // [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2] let decoded = tokenizer.decode( - &vec![1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], + &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2], true, )?; println!("{}", decoded); From 6172ee76e9acba1a1ede96fbd5df04315879a814 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:28:42 +0200 Subject: [PATCH 04/11] Attempt to map it into Node. --- bindings/node/native/src/tasks/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 2a7e3e0ad..f6e831ab4 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,14 +106,14 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(ids.to_vec(), *skip_special_tokens) + .decode(ids.as_slice(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker .tokenizer .read() .unwrap() - .decode_batch(ids.to_vec(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| v.as_slice()).collect(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 9ccb05b84db78847d8ea920405275c3ab0b18471 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:39:59 +0200 Subject: [PATCH 05/11] Second attempt. 
--- bindings/node/native/src/tasks/tokenizer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index f6e831ab4..f997fc16d 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,14 +106,14 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(ids.as_slice(), *skip_special_tokens) + .decode(&ids, *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| v.as_slice()).collect(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| &v).collect(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 21253df577add37d49ca5009d9e09e5d43541767 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:47:02 +0200 Subject: [PATCH 06/11] Step by step. --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index f997fc16d..3f8dabd16 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -106,7 +106,7 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode(&ids, *skip_special_tokens) + .decode(ids.as_slice(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Single), DecodeTask::Batch(worker, ids, skip_special_tokens) => worker From 6164f9cc0b2eefdf01526f9559b8d2f71d9a7da1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 21:57:41 +0200 Subject: [PATCH 07/11] One more step. --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 3f8dabd16..25f7f1512 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -113,7 +113,7 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| &v).collect(), *skip_special_tokens) + .decode_batch(&ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), *skip_special_tokens) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From d491913248ce739f189aa1fbcd8cc6f28a90724d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 22:08:36 +0200 Subject: [PATCH 08/11] Fix lint. 
--- bindings/node/native/src/tasks/tokenizer.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 25f7f1512..0d6a2ae22 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -113,7 +113,10 @@ impl Task for DecodeTask { .tokenizer .read() .unwrap() - .decode_batch(&ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), *skip_special_tokens) + .decode_batch( + &ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), + *skip_special_tokens + ) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), } From 774c357a2e417ed4909b55fde2d1c54338ef6019 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz <funtowiczmo@gmail.com> Date: Tue, 16 May 2023 22:16:42 +0200 Subject: [PATCH 09/11] Please ... --- bindings/node/native/src/tasks/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/node/native/src/tasks/tokenizer.rs b/bindings/node/native/src/tasks/tokenizer.rs index 0d6a2ae22..495ae53a7 100644 --- a/bindings/node/native/src/tasks/tokenizer.rs +++ b/bindings/node/native/src/tasks/tokenizer.rs @@ -115,7 +115,7 @@ impl Task for DecodeTask { .unwrap() .decode_batch( &ids.iter().map(|v| v.as_slice()).collect::<Vec<&[u32]>>(), - *skip_special_tokens + *skip_special_tokens, ) .map_err(|e| format!("{}", e)) .map(DecodeOutput::Batch), From 2f7ec04dc84df3cc5488625a4fcb492fdc3545e2 Mon Sep 17 00:00:00 2001 From: Nicolas Patry <patry.nicolas@protonmail.com> Date: Wed, 17 May 2023 10:20:34 +0200 Subject: [PATCH 10/11] Removing collect. --- bindings/python/src/tokenizer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 1fe296ed0..97a2c6466 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1028,12 +1028,11 @@ impl PyTokenizer { fn decode_batch( &self, py: Python<'_>, - sequences: Vec<Vec<u32>>, + sequences: &[&[u32]], skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); - ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() + ToPyResult(self.tokenizer.decode_batch(slices, skip_special_tokens)).into() }) } From 500848c424721fb6b66a39e738f2c11401e832ca Mon Sep 17 00:00:00 2001 From: Nicolas Patry <patry.nicolas@protonmail.com> Date: Wed, 17 May 2023 10:59:16 +0200 Subject: [PATCH 11/11] Revert "Removing collect." This reverts commit 2f7ec04dc84df3cc5488625a4fcb492fdc3545e2. --- bindings/python/src/tokenizer.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 97a2c6466..1fe296ed0 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1028,11 +1028,12 @@ impl PyTokenizer { fn decode_batch( &self, py: Python<'_>, - sequences: &[&[u32]], + sequences: Vec<Vec<u32>>, skip_special_tokens: bool, ) -> PyResult<Vec<String>> { py.allow_threads(|| { - ToPyResult(self.tokenizer.decode_batch(slices, skip_special_tokens)).into() + let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>(); + ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into() }) }
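
Net effect of the series: `Tokenizer::decode` now takes `&[u32]` and `Tokenizer::decode_batch` takes `&[&[u32]]`, so callers borrow ids instead of handing over owned vectors. A minimal sketch of the resulting call pattern, assuming a serialized tokenizer at an illustrative `tokenizer.json` path:

    use tokenizers::Tokenizer;

    fn main() -> tokenizers::Result<()> {
        // Illustrative path; any serialized tokenizer file works here.
        let tokenizer = Tokenizer::from_file("tokenizer.json")?;
        let encoding = tokenizer.encode("Hello, world!", false)?;

        // `decode` borrows the ids, so `encoding` stays usable afterwards.
        let decoded = tokenizer.decode(encoding.get_ids(), true)?;
        println!("{}", decoded);

        // `decode_batch` takes a slice of borrowed id slices; callers holding
        // a Vec<Vec<u32>> convert with `.iter().map(|v| v.as_slice())`, as the
        // Python and Node bindings above do.
        let batch: Vec<&[u32]> = vec![encoding.get_ids()];
        let decoded_batch = tokenizer.decode_batch(&batch, true)?;
        println!("{:?}", decoded_batch);

        Ok(())
    }

This removes the `to_vec()` copies that callers such as cli.rs and the documentation tests previously needed only to satisfy the owned `Vec<u32>` signature.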