Commit

fix unitest
00INDEX committed Mar 18, 2024
1 parent 898582e commit c013dc3
Showing 7 changed files with 43 additions and 275 deletions.
26 changes: 20 additions & 6 deletions parallel_tokenizer/parallel_tokenizer.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenLMLab. All rights reserved.
 import multiprocessing as mp
+import os
 import time
 from functools import partial, reduce
 from typing import Any, Callable, List, Sequence, Tuple, Union
@@ -14,6 +15,8 @@

 logger = get_logger(__name__)

+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+

 class ParallelTokenizer:
     """
@@ -73,6 +76,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
             partial(ParallelTokenizer.encode_handler, tokenizer=_tokenizer),
             chunks(text, self.chunk_size, self.overlap_length),
         )
+
+        if len(shards) == 1:
+            return shards[0]
+
         if isinstance(shards[0], (dict, BatchEncoding)):
             tokens_shards = [flatten(shard["input_ids"]) for shard in shards]
         else:
@@ -92,6 +99,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
         result = merge(shards, matches)
         return result

+    def __del__(self):
+        self.pool.close()
+        self.pool.join()
+
     def __getattr__(self, __name: str) -> Any:
         """
         Allows direct access to the tokenizer's attributes.
@@ -104,7 +115,7 @@ def __getattr__(self, __name: str) -> Any:
         """
         return getattr(self.tokenizer, __name)

-    def benchmark(self, *args: Any, **kwargs: Any) -> float:
+    def benchmark(self, *args: Any, return_acc: bool = True, **kwargs: Any) -> float:
         """
         Tests the efficiency and accuracy of the parallel tokenization process compared to the sequential process.
@@ -130,13 +141,16 @@ def benchmark(self, *args: Any, **kwargs: Any) -> float:
         raw_tokens = to_list(flatten(raw_result))
         parallel_tokens = to_list(flatten(parallel_result))

-        acc = [raw_tokens[i] - parallel_tokens[i] for i in range(min(len(raw_tokens), len(parallel_tokens)))].count(
-            0
-        ) / min(len(raw_tokens), len(parallel_tokens))
+        if return_acc:
+            acc = [raw_tokens[i] - parallel_tokens[i] for i in range(min(len(raw_tokens), len(parallel_tokens)))].count(
+                0
+            ) / min(len(raw_tokens), len(parallel_tokens))

-        logger.info(f"raw_time: {raw_time:.4f} - parallel_time: {parallel_time:.4f} - acc: {acc:.4f}")
+            logger.info(f"raw_time: {raw_time:.4f} - parallel_time: {parallel_time:.4f} - acc: {acc:.4f}")

-        return acc
+            return raw_time, parallel_time, acc
+        else:
+            logger.info(f"raw_time: {raw_time:.4f} - parallel_time: {parallel_time:.4f}")

     @staticmethod
     def encode_handler(
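Taken together, the parallel_tokenizer.py changes disable the Hugging Face tokenizers' internal parallelism (presumably to avoid fork warnings once the multiprocessing pool is used), short-circuit single-shard inputs, close the worker pool on deletion, and make benchmark() return timings alongside accuracy. A rough usage sketch of the updated benchmark() signature; the checkpoint name and chunking parameters are copied from the tests in this commit, and the sample text is illustrative:

```python
# Sketch of the updated benchmark() interface after this commit; the checkpoint
# name and chunking parameters mirror the tests below, the sample text is made up.
from transformers import AutoTokenizer

from parallel_tokenizer.parallel_tokenizer import ParallelTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-20b", trust_remote_code=True)
parallel_tokenizer = ParallelTokenizer(
    tokenizer=tokenizer,
    num_processes=4,
    chunk_size=40960,
    overlap_length=512,
    concat_keys=["input_ids", "attention_mask"],
)

text = " ".join(["token"] * 100000)

# With return_acc=True (the default) the method now returns a 3-tuple.
raw_time, parallel_time, acc = parallel_tokenizer.benchmark(text)

# With return_acc=False it only logs the two timings and returns None.
parallel_tokenizer.benchmark(text, return_acc=False)
```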
6 changes: 0 additions & 6 deletions tests/assets/special_tokens_map.json

This file was deleted.

240 changes: 0 additions & 240 deletions tests/assets/tokenization_internlm.py

This file was deleted.

Binary file removed tests/assets/tokenizer.model
15 changes: 0 additions & 15 deletions tests/assets/tokenizer_config.json

This file was deleted.

24 changes: 16 additions & 8 deletions tests/test_parallel_tokenizer.py
@@ -1,6 +1,10 @@
+import os
+from tempfile import TemporaryDirectory
+
 import pytest

 from parallel_tokenizer.parallel_tokenizer import ParallelTokenizer
+from tests.utils import download_file


 @pytest.mark.parametrize("sentence_length", [81920, 163840])
@@ -15,21 +19,25 @@ def test_sp_tokenizer_in_parallel(sentence_length: int):
     random.seed(1024)
     r = RandomWord()

-    tokenizer = SentencePieceTokenizer("./tests/assets/tokenizer.model")
+    with TemporaryDirectory() as tmp_dir:
+        model_path = os.path.join(tmp_dir, "tokenizer.model")
+        download_file("https://huggingface.co/internlm/internlm2-20b/resolve/main/tokenizer.model", model_path)
+        tokenizer = SentencePieceTokenizer(model_path)

     parallel_tokenizer = ParallelTokenizer(
         tokenizer=tokenizer,
-        parallel_degree=4,
+        num_processes=4,
         chunk_size=40960,
         overlap_length=512,
         concat_keys=["input_ids", "attention_mask"],
     )
     sentence: str = " ".join([r.word() for _ in range(sentence_length)])
-    acc: float = parallel_tokenizer.test(sentence)
+    _, _, acc = parallel_tokenizer.benchmark(sentence)
     assert math.isclose(acc, 1.0, abs_tol=1e-5)


-@pytest.mark.parametrize("sentence_length", [81920, 163840])
-@pytest.mark.parametrize("return_tensors", [None, "tp", "np"])
+@pytest.mark.parametrize("sentence_length", [81920])
+@pytest.mark.parametrize("return_tensors", [None, "pt", "np"])
 def test_hf_tokenizer_in_parallel(sentence_length: int, return_tensors: str):
     import math
     import random
@@ -40,14 +48,14 @@ def test_hf_tokenizer_in_parallel(sentence_length: int, return_tensors: str):
     random.seed(1024)
     r = RandomWord()

-    tokenizer = AutoTokenizer.from_pretrained("./tests/assets/", trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-20b", trust_remote_code=True)
     parallel_tokenizer = ParallelTokenizer(
         tokenizer=tokenizer,
-        parallel_degree=4,
+        num_processes=4,
         chunk_size=40960,
         overlap_length=512,
         concat_keys=["input_ids", "attention_mask"],
     )
     sentence: str = " ".join([r.word() for _ in range(sentence_length)])
-    acc: float = parallel_tokenizer.test(sentence, return_tensors=return_tensors)
+    _, _, acc = parallel_tokenizer.benchmark(sentence, return_tensors=return_tensors)
     assert math.isclose(acc, 1.0, abs_tol=1e-5)
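The rewritten SentencePiece test relies on a download_file helper imported from tests.utils, presumably the seventh changed file, which is not expanded above. A minimal sketch of what such a helper might look like, assuming it simply streams the URL to the given path; this is an illustration, not the actual contents of tests/utils.py:

```python
# Hypothetical stand-in for tests/utils.download_file; the real helper added
# in this commit may differ. Assumes the `requests` package is available.
import requests


def download_file(url: str, path: str, chunk_size: int = 1 << 20) -> str:
    """Stream `url` to `path` on disk and return the local path."""
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return path
```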