From 60008cebfd74a0361216735fecaaa6ba83d17a82 Mon Sep 17 00:00:00 2001
From: Mddct
Date: Thu, 30 Nov 2023 20:42:25 +0800
Subject: [PATCH 1/3] [text] huggingface tokenizer

---
 wenet/text/hugging_face_tokenizer.py | 65 ++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 wenet/text/hugging_face_tokenizer.py

diff --git a/wenet/text/hugging_face_tokenizer.py b/wenet/text/hugging_face_tokenizer.py
new file mode 100644
index 000000000..00a6b4372
--- /dev/null
+++ b/wenet/text/hugging_face_tokenizer.py
@@ -0,0 +1,65 @@
+from os import PathLike
+from typing import Union
+from wenet.text.base_tokenizer import BaseTokenizer
+
+
+class HuggingFaceTokenizer(BaseTokenizer):
+
+    def __init__(self, model: Union[str, PathLike]) -> None:
+        # NOTE(Mddct): don't build here, pickle issues
+        self.model = model
+        self.tokenizer = None
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['tokenizer']
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        recovery = {'tokenizer': None}
+        self.__dict__.update(recovery)
+
+    def _build_hugging_face(self):
+        from transformers import AutoTokenizer
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model)
+            self.t2i = {}
+            self.i2t = {}
+            for i in range(self.tokenizer.encoding.n_vocab):
+                unit = str(
+                    self.tokenizer.encoding.decode_single_token_bytes(i))
+                if len(unit) == 0:
+                    unit = str(i)
+                unit = unit.replace(" ", "")
+                # unit = bytes(unit, 'utf-8')
+                self.t2i[unit] = i
+                self.i2t[i] = unit
+            assert len(self.t2i) == len(self.i2t)
+
+    def text2tokens(self, line: str) -> List[str]:
+        self._build_hugging_face()
+        return self.tokenizer.tokenize(line)
+
+    def tokens2text(self, tokens: List[str]) -> str:
+        self._build_hugging_face()
+        ids = self.tokens2ids(tokens)
+        return self.tokenizer.decode(ids)
+
+    def tokens2ids(self, tokens: List[str]) -> List[int]:
+        self._build_hugging_face()
+        return self.tokenizer.convert_tokens_to_ids(tokens)
+
+    def ids2tokens(self, ids: List[int]) -> List[str]:
+        self._build_hugging_face()
+        return self.tokenizer.convert_ids_to_tokens(ids)
+
+    def vocab_size(self) -> int:
+        self._build_hugging_face()
+        # TODO: we need special tokenize size in future
+        return len(self.tokenizer)
+
+    @property
+    def symbol_table(self) -> Dict[str, int]:
+        self._build_tiktoken()
+        return self.t2i

From 891f8fdcdd8a204887b0c5038fa78ad130eb1c1e Mon Sep 17 00:00:00 2001
From: Mddct
Date: Thu, 30 Nov 2023 23:25:53 +0800
Subject: [PATCH 2/3] [text] unit pass

---
 .../wenet/text/test_hugging_face_tokenizer.py | 81 +++++++++++++++++++
 test/wenet/text/test_parallel.py              | 22 ++++-
 wenet/text/hugging_face_tokenizer.py          | 17 ++--
 3 files changed, 106 insertions(+), 14 deletions(-)
 create mode 100644 test/wenet/text/test_hugging_face_tokenizer.py

diff --git a/test/wenet/text/test_hugging_face_tokenizer.py b/test/wenet/text/test_hugging_face_tokenizer.py
new file mode 100644
index 000000000..5a8d85465
--- /dev/null
+++ b/test/wenet/text/test_hugging_face_tokenizer.py
@@ -0,0 +1,81 @@
+import os
+import pytest
+
+from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer
+
+try:
+    import transformers  # noqa
+except ImportError:
+    os.system('pip install --no-input transformers')
+    import transformers  # noqa
+
+
+@pytest.fixture(params=["bert-base-cased"])
+def hugging_face_tokenizer(request):
+    return HuggingFaceTokenizer(request.param)
+
+
+def test_text2tokens(hugging_face_tokenizer: HuggingFaceTokenizer):
+    tokenizer = hugging_face_tokenizer
+    text = "hello wenet very cool!"
very cool!" + expected = ['hello', 'we', '##net', 'very', 'cool', '!'] + assert all(h == r for h, r in zip(tokenizer.text2tokens(text), expected)) + + +def test_tokens2text(hugging_face_tokenizer: HuggingFaceTokenizer): + tokenizer = hugging_face_tokenizer + inputs = ['hello', 'we', '##net', 'very', 'cool', '!'] + expected = "hello wenet very cool!" + + result = tokenizer.tokens2text(inputs) + assert result == expected + + +def test_tokens2ids(hugging_face_tokenizer: HuggingFaceTokenizer): + tokenizer = hugging_face_tokenizer + inputs = ['hello', 'we', '##net', 'very', 'cool', '!'] + expected = [19082, 1195, 6097, 1304, 4348, 106] + tokens = tokenizer.tokens2ids(inputs) + assert len(tokens) == len(expected) + assert all(h == r for (h, r) in zip(tokens, expected)) + + +def test_ids2tokens(hugging_face_tokenizer: HuggingFaceTokenizer): + tokenizer = hugging_face_tokenizer + ids = [19082, 1195, 6097, 1304, 4348, 106] + expected = ['hello', 'we', '##net', 'very', 'cool', '!'] + results = tokenizer.ids2tokens(ids) + assert len(results) == len(expected) + assert all(h == r for (h, r) in zip(results, expected)) + + +def test_tokenize(hugging_face_tokenizer: HuggingFaceTokenizer): + tokenizer = hugging_face_tokenizer + + text = "hello wenet very cool!" + ids = [19082, 1195, 6097, 1304, 4348, 106] + tokens = ['hello', 'we', '##net', 'very', 'cool', '!'] + + r_tokens, r_ids = tokenizer.tokenize(text) + assert len(r_tokens) == len(tokens) + assert all(h == r for (h, r) in zip(r_tokens, tokens)) + assert len(r_ids) == len(ids) + assert all(h == r for (h, r) in zip(r_ids, ids)) + + +def test_detokenize(hugging_face_tokenizer: HuggingFaceTokenizer): + tokenizer = hugging_face_tokenizer + text = "hello wenet very cool!" + ids = [19082, 1195, 6097, 1304, 4348, 106] + tokens = ['hello', 'we', '##net', 'very', 'cool', '!'] + + r_text, r_tokens = tokenizer.detokenize(ids) + assert r_text == text + assert len(r_tokens) == len(tokens) + assert all(h == r for (h, r) in zip(r_tokens, tokens)) + + +def test_vocab_size(hugging_face_tokenizer: HuggingFaceTokenizer): + assert hugging_face_tokenizer.vocab_size() == 28996 + assert hugging_face_tokenizer.vocab_size() == len( + hugging_face_tokenizer.symbol_table) diff --git a/test/wenet/text/test_parallel.py b/test/wenet/text/test_parallel.py index 54fe7a76e..7373968b8 100644 --- a/test/wenet/text/test_parallel.py +++ b/test/wenet/text/test_parallel.py @@ -3,6 +3,7 @@ from wenet.text.base_tokenizer import BaseTokenizer from wenet.text.bpe_tokenizer import BpeTokenizer +from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer from wenet.text.whisper_tokenizer import WhisperTokenizer @@ -47,7 +48,7 @@ def test_bpe_tokenzier_parallel(): symbol_table_path = "test/resources/librispeech.words.txt" bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel" - inputs = ["WENR IS SIMPLE", "GOOD"] + inputs = ["WENT IS SIMPLE", "GOOD"] tokenizer = BpeTokenizer(bpe_model, symbol_table_path) partial_tokenize = partial(consistency, tokenizer) with Pool(processes=len(inputs)) as pool: @@ -63,7 +64,7 @@ def test_bpe_tokenizer_parallel_after_property(): symbol_table_path = "test/resources/librispeech.words.txt" bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel" - inputs = ["WENR IS SIMPLE", "GOOD"] + inputs = ["WENT IS SIMPLE", "GOOD"] tokenizer = BpeTokenizer(bpe_model, symbol_table_path) _ = tokenizer.vocab_size _ = tokenizer.symbol_table @@ -76,3 +77,20 @@ def test_bpe_tokenizer_parallel_after_property(): results.sort() assert all(h == r 
                for (h, r) in zip(results, inputs))
+
+
+def test_hugging_face_tokenizer():
+    tokenizer = HuggingFaceTokenizer("bert-base-cased")
+
+    _ = tokenizer.vocab_size
+    _ = tokenizer.symbol_table
+
+    inputs = ["wenet is simple", "good"]
+    partial_tokenize = partial(consistency, tokenizer)
+    with Pool(processes=len(inputs)) as pool:
+        results = pool.map(partial_tokenize, inputs)
+
+    inputs.sort()
+    results.sort()
+
+    assert all(h == r for (h, r) in zip(results, inputs))
diff --git a/wenet/text/hugging_face_tokenizer.py b/wenet/text/hugging_face_tokenizer.py
index 00a6b4372..8b9895372 100644
--- a/wenet/text/hugging_face_tokenizer.py
+++ b/wenet/text/hugging_face_tokenizer.py
@@ -1,5 +1,5 @@
 from os import PathLike
-from typing import Union
+from typing import Dict, List, Union
 from wenet.text.base_tokenizer import BaseTokenizer
 
 
@@ -24,17 +24,10 @@ def _build_hugging_face(self):
         from transformers import AutoTokenizer
         if self.tokenizer is None:
             self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-            self.t2i = {}
+            self.t2i = self.tokenizer.vocab
             self.i2t = {}
-            for i in range(self.tokenizer.encoding.n_vocab):
-                unit = str(
-                    self.tokenizer.encoding.decode_single_token_bytes(i))
-                if len(unit) == 0:
-                    unit = str(i)
-                unit = unit.replace(" ", "")
-                # unit = bytes(unit, 'utf-8')
-                self.t2i[unit] = i
-                self.i2t[i] = unit
+            for (i, token) in self.t2i.items():
+                self.i2t[i] = token
             assert len(self.t2i) == len(self.i2t)
 
     def text2tokens(self, line: str) -> List[str]:
@@ -61,5 +54,5 @@ def vocab_size(self) -> int:
 
     @property
     def symbol_table(self) -> Dict[str, int]:
-        self._build_tiktoken()
+        self._build_hugging_face()
         return self.t2i

From 77a265fc2393a0c0723812bf791131b26ff4f85f Mon Sep 17 00:00:00 2001
From: Mddct
Date: Fri, 1 Dec 2023 15:48:37 +0800
Subject: [PATCH 3/3] [text] add tongyi unit test && change token type to T

---
 .../wenet/text/test_hugging_face_tokenizer.py | 15 ++++++++++
 wenet/text/base_tokenizer.py                  | 18 ++++++------
 wenet/text/hugging_face_tokenizer.py          | 28 +++++++++----------
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/test/wenet/text/test_hugging_face_tokenizer.py b/test/wenet/text/test_hugging_face_tokenizer.py
index 5a8d85465..24caadbb8 100644
--- a/test/wenet/text/test_hugging_face_tokenizer.py
+++ b/test/wenet/text/test_hugging_face_tokenizer.py
@@ -79,3 +79,18 @@ def test_vocab_size(hugging_face_tokenizer: HuggingFaceTokenizer):
     assert hugging_face_tokenizer.vocab_size() == 28996
     assert hugging_face_tokenizer.vocab_size() == len(
         hugging_face_tokenizer.symbol_table)
+
+
+def test_tongyi_tokenizer():
+    # NOTE(Mddct): tongyi need extra matplotlib package
+    os.system('pip install --no-input matplotlib')
+    model_dir = 'Qwen/Qwen-Audio-Chat'
+    tongyi_tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_dir, trust_remote_code=True)
+    tokenizer = HuggingFaceTokenizer(model_dir, trust_remote_code=True)
+    text = "from transformers import AutoModelForCausalLM, AutoTokenizer"
+    tongyi_result = tongyi_tokenizer.tokenize(text)
+    result, _ = tokenizer.tokenize(text)
+
+    assert len(result) == len(tongyi_result)
+    assert all(h == r for (h, r) in zip(result, tongyi_result))
diff --git a/wenet/text/base_tokenizer.py b/wenet/text/base_tokenizer.py
index c96993309..2e7731fa1 100644
--- a/wenet/text/base_tokenizer.py
+++ b/wenet/text/base_tokenizer.py
@@ -1,33 +1,35 @@
 from abc import ABC, abstractmethod, abstractproperty
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union
+
+T = Union[str, bytes]
 
 
 class BaseTokenizer(ABC):
 
-    def tokenize(self, line: str) -> Tuple[List[str], List[int]]:
+    def tokenize(self, line: str) -> Tuple[List[T], List[int]]:
         tokens = self.text2tokens(line)
         ids = self.tokens2ids(tokens)
         return tokens, ids
 
-    def detokenize(self, ids: List[int]) -> Tuple[str, List[str]]:
+    def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]:
         tokens = self.ids2tokens(ids)
         text = self.tokens2text(tokens)
         return text, tokens
 
     @abstractmethod
-    def text2tokens(self, line: str) -> List[str]:
+    def text2tokens(self, line: str) -> List[T]:
         raise NotImplementedError("abstract method")
 
     @abstractmethod
-    def tokens2text(self, tokens: List[str]) -> str:
+    def tokens2text(self, tokens: List[T]) -> str:
         raise NotImplementedError("abstract method")
 
     @abstractmethod
-    def tokens2ids(self, tokens: List[str]) -> List[int]:
+    def tokens2ids(self, tokens: List[T]) -> List[int]:
         raise NotImplementedError("abstract method")
 
     @abstractmethod
-    def ids2tokens(self, ids: List[int]) -> List[str]:
+    def ids2tokens(self, ids: List[int]) -> List[T]:
         raise NotImplementedError("abstract method")
 
     @abstractmethod
     def vocab_size(self) -> int:
         raise NotImplementedError("abstract method")
 
     @abstractproperty
-    def symbol_table(self) -> Dict[str, int]:
+    def symbol_table(self) -> Dict[T, int]:
         raise NotImplementedError("abstract method")
diff --git a/wenet/text/hugging_face_tokenizer.py b/wenet/text/hugging_face_tokenizer.py
index 8b9895372..7ea6f0529 100644
--- a/wenet/text/hugging_face_tokenizer.py
+++ b/wenet/text/hugging_face_tokenizer.py
@@ -1,15 +1,18 @@
 from os import PathLike
 from typing import Dict, List, Union
-from wenet.text.base_tokenizer import BaseTokenizer
+from wenet.text.base_tokenizer import BaseTokenizer, T as Type
 
 
 class HuggingFaceTokenizer(BaseTokenizer):
 
-    def __init__(self, model: Union[str, PathLike]) -> None:
+    def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None:
         # NOTE(Mddct): don't build here, pickle issues
         self.model = model
         self.tokenizer = None
 
+        self.args = args
+        self.kwargs = kwargs
+
     def __getstate__(self):
         state = self.__dict__.copy()
         del state['tokenizer']
@@ -23,27 +26,24 @@ def __setstate__(self, state):
     def _build_hugging_face(self):
         from transformers import AutoTokenizer
         if self.tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-            self.t2i = self.tokenizer.vocab
-            self.i2t = {}
-            for (i, token) in self.t2i.items():
-                self.i2t[i] = token
-            assert len(self.t2i) == len(self.i2t)
-
-    def text2tokens(self, line: str) -> List[str]:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model, **self.kwargs)
+            self.t2i = self.tokenizer.get_vocab()
+
+    def text2tokens(self, line: str) -> List[Type]:
         self._build_hugging_face()
         return self.tokenizer.tokenize(line)
 
-    def tokens2text(self, tokens: List[str]) -> str:
+    def tokens2text(self, tokens: List[Type]) -> str:
         self._build_hugging_face()
         ids = self.tokens2ids(tokens)
         return self.tokenizer.decode(ids)
 
-    def tokens2ids(self, tokens: List[str]) -> List[int]:
+    def tokens2ids(self, tokens: List[Type]) -> List[int]:
         self._build_hugging_face()
         return self.tokenizer.convert_tokens_to_ids(tokens)
 
-    def ids2tokens(self, ids: List[int]) -> List[str]:
+    def ids2tokens(self, ids: List[int]) -> List[Type]:
         self._build_hugging_face()
         return self.tokenizer.convert_ids_to_tokens(ids)
 
@@ -53,6 +53,6 @@ def vocab_size(self) -> int:
         return len(self.tokenizer)
 
     @property
-    def symbol_table(self) -> Dict[str, int]:
+    def symbol_table(self) -> Dict[Type, int]:
         self._build_hugging_face()
         return self.t2i
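
Usage sketch (not part of the patch series above; it assumes the transformers package is installed and the bert-base-cased checkpoint used in the tests is reachable): the lazy _build_hugging_face() call together with the __getstate__/__setstate__ overrides keeps a HuggingFaceTokenizer picklable before the underlying AutoTokenizer is built, so the object can be shipped to multiprocessing workers and each worker rebuilds the Hugging Face tokenizer on first use.

from multiprocessing import Pool

from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer


def tokenize_line(args):
    tokenizer, line = args
    # The transformers.AutoTokenizer is built lazily inside the worker
    # process; the HuggingFaceTokenizer itself pickles cleanly because
    # __getstate__ drops the (still unbuilt) tokenizer handle.
    return tokenizer.tokenize(line)


if __name__ == "__main__":
    tokenizer = HuggingFaceTokenizer("bert-base-cased")
    lines = ["hello wenet very cool!", "wenet is simple"]
    with Pool(processes=2) as pool:
        results = pool.map(tokenize_line, [(tokenizer, line) for line in lines])
    for tokens, ids in results:
        print(tokens, ids)

This mirrors the Pool-based consistency check added to test/wenet/text/test_parallel.py in PATCH 2/3, which is what the pickle-related design in __init__ exists to support.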