From d09fe48494d01e6699a4efafcc4a2893c7f9d24d Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 01:01:37 -0600
Subject: [PATCH 01/10] fix mistral vocab usage

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 80e21c2d32ecc..9987c3529345a 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -72,18 +72,12 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             # Make sure special tokens will not raise
             tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
         elif isinstance(tokenizer_, SentencePieceTokenizer):
-            self._vocab = {
-                token: idx
-                for idx, token in enumerate(tokenizer_.vocab())
-            }
+            pass
         else:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
+        self._vocab = tokenizer_.vocab()
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -182,7 +176,10 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        return self._vocab
+        # Convert to a Dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to �
+        return {token: idx for idx, token in enumerate(self._vocab)}
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary

From 986fbd9c38f7cb18a994bddd35d7d178fff13693 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 01:04:59 -0600
Subject: [PATCH 02/10] fix: convert any token with an incomplete character to bytes

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 9987c3529345a..9c33191e5d338 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -254,9 +254,10 @@ def convert_ids_to_tokens(
 
         tokens = [self.tokenizer.id_to_piece(id) for id in ids]
 
-        if any(t.strip() == "�" for t in tokens):
-            # if any stripped decoded token is undefined
-            # because it's invalid unicode then pass bytes
+        if any("�" in t for t in tokens):
+            # if a decoded token contains the replacement character, then the
+            # token has an incomplete UTF-8 character so we must use a byte
+            # string to avoid losing information
             # See: https://github.com/vllm-project/vllm/pull/8640
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 

From 4c3f0143c41bfdc6cb752ef9447b4fc22c4b3c16 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 23 Oct 2024 10:18:16 -0600
Subject: [PATCH 03/10] catch and log about invalid tokens in convert_tokens_to_string

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 9c33191e5d338..a911c2793ccf5 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -16,9 +16,13 @@ from mistral_common.tokens.tokenizers.tekken import
 (SpecialTokenPolicy, Tekkenizer)
 
+from vllm.logger import init_logger
+
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class Encoding:
@@ -217,14 +221,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
         if any(isinstance(t, bytes) for t in tokens):
             # we need to encode and decode all tokens again
             shift = self.tokenizer.num_special_tokens
-            byte_tokens = [
-                t.encode("utf-8") if not isinstance(t, bytes) else t
-                for t in tokens
-            ]
-            ids = [
-                self.tokenizer._tekken_token2id_nospecial[t] + shift
-                for t in byte_tokens
-            ]
+
+            def _token_to_id(t: str):
+                t_bytes = t.encode("utf-8") \
+                    if not isinstance(t, bytes) else t
+                try:
+                    return shift + \
+                        self.tokenizer._tekken_token2id_nospecial[t_bytes]
+                except KeyError:
+                    logger.warning(
+                        "Failed to convert token %s to id,"
+                        " replacing with <unk>", t_bytes)
+                    return self.tokenizer.unk_id
+
+            ids = [_token_to_id(t) for t in tokens]
             decoded = self.tokenizer.decode(ids)
         else:
             decoded = "".join(tokens)

From 888be8205cd90259e6aa6f6278ab91e4abcef36b Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 13:39:01 -0700
Subject: [PATCH 04/10] =?UTF-8?q?=E2=9C=85=20add=20mistral=20model=20to=20?=
 =?UTF-8?q?test=5Fdetokenize?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 60 ++++++++++++++++++---------
 1 file changed, 41 insertions(+), 19 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index f4551ed42efb8..efb366bf452d5 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Generator, List, Optional
 
 import pytest
 from transformers import AutoTokenizer
@@ -7,11 +7,13 @@
 from vllm.transformers_utils.detokenizer import (Detokenizer,
                                                  detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
 TRUTH = [
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
-    "我很感谢你的热情"
+    "我很感谢你的热情",
+    "THIS IS AN URGENCY",
 ]
 TOKENIZERS = [
     "facebook/opt-125m",
@@ -24,6 +26,7 @@
     "tiiuae/falcon-7b",
     "meta-llama/Llama-2-7b-hf",
     "codellama/CodeLlama-7b-hf",
+    "mistralai/Pixtral-12B-2409",
 ]
 
 
@@ -49,26 +52,42 @@ def _run_incremental_decode(tokenizer, all_input_ids,
     return decoded_text
 
 
+@pytest.fixture
+def tokenizer(tokenizer_name):
+    return (MistralTokenizer.from_pretrained(tokenizer_name)
+            if "mistral" in tokenizer_name else
+            AutoTokenizer.from_pretrained(tokenizer_name))
+
+
+@pytest.fixture
+def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
+    if "mistral" in tokenizer_name:
+        yield (
+            bool(True) if request.param else
+            pytest.skip("mistral doesn't support skip_special_tokens=False"))
+    else:
+        yield bool(True) if request.param else bool(False)
+
+
 @pytest.mark.parametrize("truth", TRUTH)
 @pytest.mark.parametrize("with_prompt", [True, False])
-@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", (True, False))
-def test_decode_streaming(tokenizer_id, truth, with_prompt,
-                          skip_special_tokens):
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
+@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
+def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
     if with_prompt:
-        truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
         prompt_input_ids = truth_tokens[:len(truth) // 2]
         generated_input_ids = truth_tokens[len(truth) // 2:]
         all_input_ids = prompt_input_ids + generated_input_ids
         starting_index = len(prompt_input_ids)
-        prompt = tokenizer.decode(prompt_input_ids,
-                                  skip_special_tokens=skip_special_tokens)
+        prompt = (tokenizer.decode(prompt_input_ids) if isinstance(
+            tokenizer, MistralTokenizer) else tokenizer.decode(
+                prompt_input_ids, skip_special_tokens=skip_special_tokens))
         generated = truth[len(prompt):]
     else:
         generated = truth
         starting_index = 0
-        all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+        all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
     if skip_special_tokens:
         if tokenizer.bos_token_id is not None:
             all_input_ids = [tokenizer.bos_token_id] + all_input_ids
@@ -98,7 +117,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer:
         enable_lora=False,
         max_num_seqs=100,
         max_input_length=None,
-        tokenizer_mode="auto",
+        tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
         trust_remote_code=False,
        revision=None,
     )
@@ -113,9 +132,8 @@
 
 @pytest.fixture(name="complete_sequence_token_ids")
 def create_complete_sequence_token_ids(complete_sequence: str,
-                                       tokenizer_name: str) -> List[int]:
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"]
+                                       tokenizer) -> List[int]:
+    complete_sequence_token_ids = tokenizer(complete_sequence).input_ids
     return complete_sequence_token_ids
@@ -150,7 +168,7 @@ def create_dummy_prompt_logprobs(
 
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
 def test_decode_sequence_logprobs(complete_sequence: str,
                                   complete_sequence_token_ids: List[int],
                                   detokenizer: Detokenizer,
@@ -208,9 +226,13 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
 
     # decoded_prompt_logprobs doesn't contain the first token.
     token_ids = complete_sequence_token_ids
-    tokenzier = detokenizer.get_tokenizer_for_seq(seq)
-    text_full = tokenzier.decode(token_ids, skip_special_tokens=True)
-    text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True)
+    tokenizer = detokenizer.get_tokenizer_for_seq(seq)
+    text_full = (tokenizer.decode(token_ids) if isinstance(
+        tokenizer, MistralTokenizer) else tokenizer.decode(
+            token_ids, skip_special_tokens=True))
+    text_first = (tokenizer.decode(token_ids[0]) if isinstance(
+        tokenizer, MistralTokenizer) else tokenizer.decode(
+            token_ids[0], skip_special_tokens=True))
     text = text_full[len(text_first):]
 
     # Text for logprobs for the chosen token should be the same as the

From 0c16e3b3b77fffc32e9ffaa0336c17091a904efb Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 13:50:43 -0700
Subject: [PATCH 05/10] =?UTF-8?q?=E2=9C=85=20add=20another=20mistral=20edg?=
 =?UTF-8?q?e=20case=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index efb366bf452d5..7f987a62c92fd 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -13,6 +13,7 @@
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
     "我很感谢你的热情",
+    # see https://github.com/vllm-project/vllm/pull/9625
     "THIS IS AN URGENCY",
 ]
 TOKENIZERS = [
@@ -59,6 +60,14 @@ def tokenizer(tokenizer_name):
             AutoTokenizer.from_pretrained(tokenizer_name))
 
 
+# see https://github.com/vllm-project/vllm/pull/9625
+@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+def test_mistral_edge_case(tokenizer):
+    assert (_run_incremental_decode(tokenizer, [1492, 1176, 115679],
+                                    skip_special_tokens=True,
+                                    starting_index=0) == " ð")
+
+
 @pytest.fixture

From 24c2f743780553c423398e360bfbdf129b0c84bd Mon Sep 17 00:00:00 2001
From: Prashant Gupta
Date: Wed, 23 Oct 2024 14:28:14 -0700
Subject: [PATCH 06/10] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20add=20skip=5Fspecial?=
 =?UTF-8?q?=5Ftokens=20in=20mistral=20decode=20to=20maintain=20consistency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Prashant Gupta
---
 tests/tokenization/test_detokenize.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 7f987a62c92fd..446a801bdb311 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -89,9 +89,8 @@ def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens):
         generated_input_ids = truth_tokens[len(truth) // 2:]
         all_input_ids = prompt_input_ids + generated_input_ids
         starting_index = len(prompt_input_ids)
-        prompt = (tokenizer.decode(prompt_input_ids) if isinstance(
-            tokenizer, MistralTokenizer) else tokenizer.decode(
-                prompt_input_ids, skip_special_tokens=skip_special_tokens))
+        prompt = tokenizer.decode(prompt_input_ids,
+                                  skip_special_tokens=skip_special_tokens)
         generated = truth[len(prompt):]
     else:
         generated = truth
@@ -236,12 +235,8 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
 
     # decoded_prompt_logprobs doesn't contain the first token.
     token_ids = complete_sequence_token_ids
     tokenizer = detokenizer.get_tokenizer_for_seq(seq)
-    text_full = (tokenizer.decode(token_ids) if isinstance(
-        tokenizer, MistralTokenizer) else tokenizer.decode(
-            token_ids, skip_special_tokens=True))
-    text_first = (tokenizer.decode(token_ids[0]) if isinstance(
-        tokenizer, MistralTokenizer) else tokenizer.decode(
-            token_ids[0], skip_special_tokens=True))
+    text_full = tokenizer.decode(token_ids, skip_special_tokens=True)
+    text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True)
     text = text_full[len(text_first):]
 
     # Text for logprobs for the chosen token should be the same as the

From dcbc166e7062830ce14a7882695bf7bf637ab1da Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 10:11:30 -0600
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: Patrick von Platen
Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index a911c2793ccf5..7c72c871264ed 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -82,6 +82,10 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
 
         self._vocab = tokenizer_.vocab()
+        # Convert to a Dict[str, int] to match protocol, but this is a lossy
+        # conversion. There may be multiple token ids that decode to the same
+        # string due to partial UTF-8 byte sequences being converted to �
+        self.vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -180,10 +184,7 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        # Convert to a Dict[str, int] to match protocol, but this is a lossy
-        # conversion. There may be multiple token ids that decode to the same
-        # string due to partial UTF-8 byte sequences being converted to �
-        return {token: idx for idx, token in enumerate(self._vocab)}
+        return self.vocab_dict
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary

From 5c4c38109fdc26b1927e76eae4cde674de0cdec1 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 15:43:34 -0600
Subject: [PATCH 08/10] refactor: add skip_special_tokens to MistralTokenizer.decode to align with API of other tokenizers

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 7c72c871264ed..e5192c3a214bf 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -244,7 +244,13 @@ def _token_to_id(t: str):
 
         return decoded
 
-    def decode(self, ids: Union[List[int], int]) -> str:
+    def decode(self,
+               ids: Union[List[int], int],
+               skip_special_tokens: bool = True) -> str:
+        assert (
+            skip_special_tokens
+        ), "Skipping special tokens is not supported for Mistral tokenizers."
+
         if isinstance(ids, int):
             ids = [ids]
         return self.tokenizer.decode(ids)

From 784aa727846bf8624f3566c74b8380c5c1f14ccc Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 16:54:54 -0600
Subject: [PATCH 09/10] some changes for clarity

Signed-off-by: Travis Johnson
---
 tests/tokenization/test_detokenize.py         | 32 +++++++++++++++----
 vllm/transformers_utils/tokenizers/mistral.py | 13 +++++---
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 446a801bdb311..1d07885349409 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -13,8 +13,11 @@
     "Hello here, this is a simple test",
     "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
     "我很感谢你的热情",
+    # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg.
+    # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
+    # incomplete UTF-8 characters
     # see https://github.com/vllm-project/vllm/pull/9625
-    "THIS IS AN URGENCY",
+    "ပုံပြင်လေးပြောပြပါ်",
 ]
 TOKENIZERS = [
     "facebook/opt-125m",
@@ -60,12 +63,29 @@ def tokenizer(tokenizer_name):
             AutoTokenizer.from_pretrained(tokenizer_name))
 
 
-# see https://github.com/vllm-project/vllm/pull/9625
 @pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
-def test_mistral_edge_case(tokenizer):
-    assert (_run_incremental_decode(tokenizer, [1492, 1176, 115679],
-                                    skip_special_tokens=True,
-                                    starting_index=0) == " ð")
+@pytest.mark.parametrize(
+    "truth",
+    [
+        # Burmese text triggers an edge-case where tokens may map to bytes with
+        # incomplete UTF-8 characters
+        "ပုံပြင်လေးပြောပြပါ",
+        # Using "URGENCY" since "CY" has token id 130282
+        "URGENCY🌶️",
+    ])
+def test_mistral_edge_case(tokenizer, truth):
+    """Test for specific edge cases with the V3-Tekken MistralTokenizer.
+
+    See https://github.com/vllm-project/vllm/pull/9625
+    """
+    starting_index = 0
+    all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids
+
+    decoded_text = _run_incremental_decode(tokenizer,
+                                           all_input_ids,
+                                           skip_special_tokens=True,
+                                           starting_index=starting_index)
+    assert decoded_text == truth
 
 
 @pytest.fixture
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index e5192c3a214bf..b0514ddf1c384 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -85,7 +85,10 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
         # Convert to a Dict[str, int] to match protocol, but this is a lossy
         # conversion. There may be multiple token ids that decode to the same
         # string due to partial UTF-8 byte sequences being converted to �
-        self.vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
+        self._vocab_dict = {
+            token: idx
+            for idx, token in enumerate(self._vocab)
+        }
         self.tokenizer = tokenizer_
         self._max_token_id = max(self._vocab.values())
 
@@ -184,7 +187,9 @@ def __call__(
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
-        return self.vocab_dict
+        # NB: the dictionary form of the vocabulary collapses token ids that map
+        # to the same string but have different bytes
+        return self._vocab_dict
 
     def get_added_vocab(self) -> Dict[str, int]:
         # Mistral tokenizers have no added vocabulary
@@ -273,9 +278,9 @@ def convert_ids_to_tokens(
 
         if any("�" in t for t in tokens):
             # if a decoded token contains the replacement character, then the
-            # token has an incomplete UTF-8 character so we must use a byte
-            # string to avoid losing information
+            # token has an incomplete UTF-8 character so we must use bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
+            #      https://github.com/vllm-project/vllm/pull/9625
             tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
 
         return tokens

From b7b21165e605023f76cb6591dc752e37be4cd09f Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Wed, 30 Oct 2024 23:01:08 -0600
Subject: [PATCH 10/10] fix: use vocab_size -1 to get max token id for MistralTokenizer

Signed-off-by: Travis Johnson
---
 vllm/transformers_utils/tokenizers/mistral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index b0514ddf1c384..896f70bc1dafd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -90,7 +90,7 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
             for idx, token in enumerate(self._vocab)
         }
         self.tokenizer = tokenizer_
-        self._max_token_id = max(self._vocab.values())
+        self._max_token_id = self.vocab_size - 1
 
     @classmethod
    def from_pretrained(cls,
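
Note on the UTF-8 edge case (illustrative sketch, not part of the patch series): the snippet below restates, using only the Python standard library, why the patches above treat the replacement character "�" as a signal to fall back to byte pieces. The two pieces here are hand-split halves of a single UTF-8 character and merely stand in for real Tekkenizer byte-level tokens; the split point is chosen by hand for illustration, not produced by a real vocabulary.

    # One emoji codepoint encodes to four UTF-8 bytes.
    text = "🌶"
    data = text.encode("utf-8")          # b'\xf0\x9f\x8c\xb6'

    # Simulate two byte-level tokens that split the character mid-sequence.
    piece_a, piece_b = data[:2], data[2:]

    # Decoding either piece alone yields U+FFFD; this is the condition that
    # the `if any("�" in t for t in tokens)` check in convert_ids_to_tokens
    # detects.
    print(piece_a.decode("utf-8", errors="replace"))  # '�'
    print(piece_b.decode("utf-8", errors="replace"))  # '��'

    # Keeping the pieces as bytes (cf. id_to_byte_piece) and joining them
    # before decoding preserves the original character.
    print((piece_a + piece_b).decode("utf-8"))        # '🌶'

The same collision explains why get_vocab() is documented as lossy — distinct byte pieces can decode to strings containing "�", so a str-to-id dictionary collapses some entries — and why convert_tokens_to_string must re-encode byte tokens back to ids and decode the whole sequence at once.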