From d09fe48494d01e6699a4efafcc4a2893c7f9d24d Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 01:01:37 -0600
Subject: [PATCH 01/10] fix mistral vocab usage

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 80e21c2d32ecc..9987c3529345a 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -72,18 +72,12 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             # Make sure special tokens will not raise
             tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
         elif isinstance(tokenizer_, SentencePieceTokenizer):
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
+            pass
         else:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
+        self._vocab = tokenizer_.vocab()
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -182,7 +176,10 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        return self._vocab
+        # Convert to a Dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to �
+        return {token: idx for idx, token in enumerate(self._vocab)}
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary

From 986fbd9c38f7cb18a994bddd35d7d178fff13693 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 01:04:59 -0600
Subject: [PATCH 02/10] fix: convert any token with an incomplete character to bytes

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 9987c3529345a..9c33191e5d338 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -254,9 +254,10 @@ def convert_ids_to_tokens(
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
-        if any(t.strip() == "�" for t in tokens):
-            # if any stripped decoded token is undefined
-            # because it's invalid unicode then pass bytes
+        if any("�" in t for t in tokens):
+            # if a decoded token contains the replacement character, then the
+            # token has an incomplete UTF-8 character so we must use a byte
+            # string to avoid losing information
             # See: https://github.com/vllm-project/vllm/pull/8640
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 

From 4c3f0143c41bfdc6cb752ef9447b4fc22c4b3c16 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 10:18:16 -0600
Subject: [PATCH 03/10] catch and log about invalid tokens in convert_tokens_to_string

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 9c33191e5d338..a911c2793ccf5 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,9 +16,13 @@ from mistral_common.tokens.tokenizers.tekken import
 (SpecialTokenPolicy, Tekkenizer)
 
+from vllm.logger import init_logger
+
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class Encoding:
@@ -217,14 +221,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
         if any(isinstance(t, bytes) for t in tokens):
             # we need to encode and decode all tokens again
             shift = self.tokenizer.num_special_tokens
-            byte_tokens = [
-                t.encode("utf-8") if not isinstance(t, bytes) else t
-                for t in tokens
-            ]
-            ids = [
-                self.tokenizer._tekken_token2id_nospecial[t] + shift
-                for t in byte_tokens
-            ]
+
+            def _token_to_id(t: str):
+                t_bytes = t.encode("utf-8") \
+                    if not isinstance(t, bytes) else t
+                try:
+                    return shift + \
+                        self.tokenizer._tekken_token2id_nospecial[t_bytes]
+                except KeyError:
+                    logger.warning(
+                        "Failed to convert token %s to id,"
+                        " replacing with <unk>", t_bytes)
+                    return self.tokenizer.unk_id
+
+            ids = [_token_to_id(t) for t in tokens]
             decoded = self.tokenizer.decode(ids)
         else:
             decoded = "".join(tokens)

From 888be8205cd90259e6aa6f6278ab91e4abcef36b Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 13:39:01 -0700
Subject: [PATCH 04/10] =?UTF-8?q?=E2=9C=85=20add=20mistral=20model=20to=20?=
 =?UTF-8?q?test=5Fdetokenize?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 60 ++++++++++++++++++---------
 1 file changed, 41 insertions(+), 19 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index f4551ed42efb8..efb366bf452d5 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional
 
 import pytest
 from transformers import AutoTokenizer
@@ -7,11 +7,13 @@
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 TRUTH = [
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
-    "我很感谢你的热情"
+    "我很感谢你的热情",
+    "THIS IS AN URGENCY",
 ]
 TOKENIZERS = [
     "facebook/opt-125m",
@@ -24,6 +26,7 @@
     "tiiuae/falcon-7b",
     "meta-llama/Llama-2-7b-hf",
     "codellama/CodeLlama-7b-hf",
+    "mistralai/Pixtral-12B-2409",
 ]
 
 
@@ -49,26 +52,42 @@ def _run_incremental_decode(tokenizer, all_input_ids,
     return decoded_text
 
 
+@pytest.fixture
+def tokenizer(tokenizer_name):
+    return (MistralTokenizer.from_pretrained(tokenizer_name)
+            if "mistral" in tokenizer_name else
+            AutoTokenizer.from_pretrained(tokenizer_name))
+
+
+@pytest.fixture
+def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
+    if "mistral" in tokenizer_name:
+        yield (
+            bool(True) if request.param else
+            pytest.skip("mistral doesn't support skip_special_tokens=False"))
+    else:
+        yield bool(True) if request.param else bool(False)
+
+
 @pytest.mark.parametrize("truth", TRUTH)
 @pytest.mark.parametrize("with_prompt", [True, False])
-@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", (True, False))
-def test_decode_streaming(tokenizer_id, truth, with_prompt,
-                          skip_special_tokens):
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
+@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
+def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
     if with_prompt:
-        truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
         prompt_input_ids = truth_tokens[:len(truth) // 2]
         generated_input_ids = truth_tokens[len(truth) // 2:]
         all_input_ids = prompt_input_ids + generated_input_ids
         starting_index = len(prompt_input_ids)
-        prompt = tokenizer.decode(prompt_input_ids,
-                                  skip_special_tokens=skip_special_tokens)
+        prompt = (tokenizer.decode(prompt_input_ids) if isinstance(
+            tokenizer, MistralTokenizer) else tokenizer.decode(
+                prompt_input_ids, skip_special_tokens=skip_special_tokens))
         generated = truth[len(prompt):]
     else:
         generated = truth
         starting_index = 0
-        all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
     if skip_special_tokens:
         if tokenizer.bos_token_id is not None:
             all_input_ids = [tokenizer.bos_token_id] + all_input_ids
@@ -98,7 +117,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
         enable_lora=False,
         max_num_seqs=100,
         max_input_length=None,
-        tokenizer_mode="auto",
+        tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
         trust_remote_code=False,
        revision=None,
     )
@@ -113,9 +132,8 @@
 
 @pytest.fixture(name="complete_sequence_token_ids")
 def create_complete_sequence_token_ids(complete_sequence: str,
-                                       tokenizer_name: str) -> List[int]:
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"]
+                                       tokenizer) -> List[int]:
+    complete_sequence_token_ids = tokenizer(complete_sequence).input_ids
     return complete_sequence_token_ids
@@ -150,7 +168,7 @@ def create_dummy_prompt_logprobs(
 
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
 def test_decode_sequence_logprobs(complete_sequence: str,
                                   complete_sequence_token_ids: List[int],
                                   detokenizer: Detokenizer,
@@ -208,9 +226,13 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
 
     # decoded_prompt_logprobs doesn't contain the first token.
     token_ids = complete_sequence_token_ids
-    tokenzier = detokenizer.get_tokenizer_for_seq(seq)
-    text_full = tokenzier.decode(token_ids, skip_special_tokens=True)
-    text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True)
+    tokenizer = detokenizer.get_tokenizer_for_seq(seq)
+    text_full = (tokenizer.decode(token_ids) if isinstance(
+        tokenizer, MistralTokenizer) else tokenizer.decode(
+            token_ids, skip_special_tokens=True))
+    text_first = (tokenizer.decode(token_ids[0]) if isinstance(
+        tokenizer, MistralTokenizer) else tokenizer.decode(
+            token_ids[0], skip_special_tokens=True))
     text = text_full[len(text_first):]
 
     # Text for logprobs for the chosen token should be the same as the

From 0c16e3b3b77fffc32e9ffaa0336c17091a904efb Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 13:50:43 -0700
Subject: [PATCH 05/10] =?UTF-8?q?=E2=9C=85=20add=20another=20mistral=20edg?=
 =?UTF-8?q?e=20case=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index efb366bf452d5..7f987a62c92fd 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -13,6 +13,7 @@
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
     "我很感谢你的热情",
+    # see https://github.com/vllm-project/vllm/pull/9625
     "THIS IS AN URGENCY",
 ]
 TOKENIZERS = [
@@ -59,6 +60,14 @@ def tokenizer(tokenizer_name):
             AutoTokenizer.from_pretrained(tokenizer_name))
 
 
+# see https://github.com/vllm-project/vllm/pull/9625
+@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+def test_mistral_edge_case(tokenizer):
+    assert (_run_incremental_decode(tokenizer, [1492, 1176, 115679],
+                                    skip_special_tokens=True,
+                                    starting_index=0) == " ð")
+
+
 @pytest.fixture

From 24c2f743780553c423398e360bfbdf129b0c84bd Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 14:28:14 -0700
Subject: [PATCH 06/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20add=20skip=5Fspecial?=
 =?UTF-8?q?=5Ftokens=20in=20mistral=20decode=20to=20maintain=20consistency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 7f987a62c92fd..446a801bdb311 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -89,9 +89,8 @@ def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
         generated_input_ids = truth_tokens[len(truth) // 2:]
         all_input_ids = prompt_input_ids + generated_input_ids
         starting_index = len(prompt_input_ids)
-        prompt = (tokenizer.decode(prompt_input_ids) if isinstance(
-            tokenizer, MistralTokenizer) else tokenizer.decode(
-                prompt_input_ids, skip_special_tokens=skip_special_tokens))
+        prompt = tokenizer.decode(prompt_input_ids,
+                                  skip_special_tokens=skip_special_tokens)
         generated = truth[len(prompt):]
     else:
         generated = truth
@@ -236,12 +235,8 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
 
     # decoded_prompt_logprobs doesn't contain the first token.
     token_ids = complete_sequence_token_ids
     tokenizer = detokenizer.get_tokenizer_for_seq(seq)
-    text_full = (tokenizer.decode(token_ids) if isinstance(
-        tokenizer, MistralTokenizer) else tokenizer.decode(
-            token_ids, skip_special_tokens=True))
-    text_first = (tokenizer.decode(token_ids[0]) if isinstance(
-        tokenizer, MistralTokenizer) else tokenizer.decode(
-            token_ids[0], skip_special_tokens=True))
+    text_full = tokenizer.decode(token_ids, skip_special_tokens=True)
+    text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True)
     text = text_full[len(text_first):]
 
     # Text for logprobs for the chosen token should be the same as the

From dcbc166e7062830ce14a7882695bf7bf637ab1da Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 10:11:30 -0600
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: Patrick von Platen
Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index a911c2793ccf5..7c72c871264ed 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -82,6 +82,10 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
         self._vocab = tokenizer_.vocab()
+        # Convert to a Dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to �
+        self.vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -180,10 +184,7 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        # Convert to a Dict[str, int] to match protocol, but this is a lossy
-        # conversion. There may be multiple token ids that decode to the same
-        # string due to partial UTF-8 byte sequences being converted to �
-        return {token: idx for idx, token in enumerate(self._vocab)}
+        return self.vocab_dict
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary

From 5c4c38109fdc26b1927e76eae4cde674de0cdec1 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 15:43:34 -0600
Subject: [PATCH 08/10] refactor: add skip_special_tokens to MistralTokenizer.decode to align with API of other tokenizers

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 7c72c871264ed..e5192c3a214bf 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -244,7 +244,13 @@ def _token_to_id(t: str):
 
         return decoded
 
-    def decode(self, ids: Union[List[int], int]) -> str:
+    def decode(self,
+               ids: Union[List[int], int],
+               skip_special_tokens: bool = True) -> str:
+        assert (
+            skip_special_tokens
+        ), "Skipping special tokens is not supported for Mistral tokenizers."
+
         if isinstance(ids, int):
             ids = [ids]
         return self.tokenizer.decode(ids)

From 784aa727846bf8624f3566c74b8380c5c1f14ccc Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 16:54:54 -0600
Subject: [PATCH 09/10] some changes for clarity

Signed-off-by: Travis Johnson
---
 tests/tokenization/test_detokenize.py         | 32 +++++++++++++++----
 vllm/transformers_utils/tokenizers/mistral.py | 13 +++++---
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 446a801bdb311..1d07885349409 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -13,8 +13,11 @@
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
     "我很感谢你的热情",
+    # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg.
+    # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
+    # incomplete UTF-8 characters
     # see https://github.com/vllm-project/vllm/pull/9625
-    "THIS IS AN URGENCY",
+    "ပုံပြင်လေးပြောပြပါ်",
 ]
 TOKENIZERS = [
     "facebook/opt-125m",
@@ -60,12 +63,29 @@ def tokenizer(tokenizer_name):
             AutoTokenizer.from_pretrained(tokenizer_name))
 
 
-# see https://github.com/vllm-project/vllm/pull/9625
 @pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
-def test_mistral_edge_case(tokenizer):
-    assert (_run_incremental_decode(tokenizer, [1492, 1176, 115679],
-                                    skip_special_tokens=True,
-                                    starting_index=0) == " ð")
+@pytest.mark.parametrize(
+    "truth",
+    [
+        # Burmese text triggers an edge-case where tokens may map to bytes with
+        # incomplete UTF-8 characters
+        "ပုံပြင်လေးပြောပြပါ",
+        # Using "URGENCY" since "CY" has token id 130282
+        "URGENCY🌶️",
+    ])
+def test_mistral_edge_case(tokenizer, truth):
+    """Test for specific edge cases with the V3-Tekken MistralTokenizer.
+
+    See https://github.com/vllm-project/vllm/pull/9625
+    """
+    starting_index = 0
+    all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
+
+    decoded_text = _run_incremental_decode(tokenizer,
+                                           all_input_ids,
+                                           skip_special_tokens=True,
+                                           starting_index=starting_index)
+    assert decoded_text == truth
 
 
 @pytest.fixture
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index e5192c3a214bf..b0514ddf1c384 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -85,7 +85,10 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
         # Convert to a Dict[str, int] to match protocol, but this is a lossy
         # conversion. There may be multiple token ids that decode to the same
         # string due to partial UTF-8 byte sequences being converted to �
-        self.vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
+        self._vocab_dict = {
+            token: idx
+            for idx, token in enumerate(self._vocab)
+        }
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -184,7 +187,9 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        return self.vocab_dict
+        # NB: the dictionary form of the vocabulary collapses token ids that map
+        # to the same string but have different bytes
+        return self._vocab_dict
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
@@ -273,9 +278,9 @@ def convert_ids_to_tokens(
 
         if any("�" in t for t in tokens):
             # if a decoded token contains the replacement character, then the
-            # token has an incomplete UTF-8 character so we must use a byte
-            # string to avoid losing information
+            # token has an incomplete UTF-8 character so we must use bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
+            #      https://github.com/vllm-project/vllm/pull/9625
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 
         return tokens

From b7b21165e605023f76cb6591dc752e37be4cd09f Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 23:01:08 -0600
Subject: [PATCH 10/10] fix: use vocab_size -1 to get max token id for MistralTokenizer

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index b0514ddf1c384..896f70bc1dafd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -90,7 +90,7 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             for idx, token in enumerate(self._vocab)
         }
         self.tokenizer = tokenizer_
-        self._max_token_id = max(self._vocab.values())
+        self._max_token_id = self.vocab_size - 1
 
     @classmethod
    def from_pretrained(cls,
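
Note on the UTF-8 edge case (illustrative sketch, not part of the patch series): the snippet below restates, using only the Python standard library, why the patches above treat the replacement character "�" as a signal to fall back to byte pieces. The two pieces here are hand-split halves of a single UTF-8 character and merely stand in for real Tekkenizer byte-level tokens; the split point is chosen by hand for illustration, not produced by a real vocabulary.

    # One emoji codepoint encodes to four UTF-8 bytes.
    text = "🌶"
    data = text.encode("utf-8")          # b'\xf0\x9f\x8c\xb6'

    # Simulate two byte-level tokens that split the character mid-sequence.
    piece_a, piece_b = data[:2], data[2:]

    # Decoding either piece alone yields U+FFFD; this is the condition that
    # the `if any("�" in t for t in tokens)` check in convert_ids_to_tokens
    # detects.
    print(piece_a.decode("utf-8", errors="replace"))  # '�'
    print(piece_b.decode("utf-8", errors="replace"))  # '��'

    # Keeping the pieces as bytes (cf. id_to_byte_piece) and joining them
    # before decoding preserves the original character.
    print((piece_a + piece_b).decode("utf-8"))        # '🌶'

The same collision explains why get_vocab() is documented as lossy — distinct byte pieces can decode to strings containing "�", so a str-to-id dictionary collapses some entries — and why convert_tokens_to_string must re-encode byte tokens back to ids and decode the whole sequence at once.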