Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable fancy regex #1586

Merged
merged 3 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ env_logger = "0.11"
pyo3 = { version = "0.21" }
numpy = "0.21"
ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.12"

[dependencies.tokenizers]
Expand Down
18 changes: 12 additions & 6 deletions bindings/python/benches/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def format_byte_size(num_bytes: int) -> Tuple[str, str]:
return f"{num_bytes_f:.2f} PB", "PB"


def benchmark_batch(model: str, documents: list[str], num_threads: int) -> None:
def benchmark_batch(model: str, documents: list[str], num_threads: int, document_length: float) -> None:
os.environ["RAYON_NUM_THREADS"] = str(num_threads)
num_bytes = sum(map(len, map(str.encode, documents)))
readable_size, unit = format_byte_size(num_bytes)
print(f"==============")
print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)}")
print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
mergeable_ranks = load_tiktoken_bpe(filename)
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
Expand Down Expand Up @@ -82,24 +82,30 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int) -> None:
def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
dataset_xnli = load_dataset(dataset, dataset_config)

input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
# input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
input_lengths = [(10_000, False, True), (10_000, False, False)]

for num_threads in threads:
for length, fuse in input_lengths:
for length, fuse, long in input_lengths:
documents = []
for i, item in enumerate(dataset_xnli["train"]):
if i >= length:
break
documents.append("".join(item["premise"].values()))
if long:
documents.append("".join(item["premise"].values()))
else:
documents.append(item["premise"]["en"])
if fuse:
documents=["".join(documents)]

document_length = sum(len(d) for d in documents) / len(documents)

# Rayon thread pool is global to a process, we need to launch
# separate processes in order to accurately use the correct number of threads.
# Otherwise, we're simply running tokenizers in whatever test comes first.
# tokenizers does NOT provide a method to change the number of threads during
# runtime.
p = Process(target=benchmark_batch, args=(model, documents, num_threads))
p = Process(target=benchmark_batch, args=(model, documents, num_threads, document_length))
p.start()
p.join()

Expand Down
8 changes: 4 additions & 4 deletions bindings/python/src/utils/regex.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use onig::Regex;
use pyo3::exceptions;
use pyo3::prelude::*;
use tk::utils::SysRegex;

/// Instantiate a new Regex with the given pattern
#[pyclass(module = "tokenizers", name = "Regex")]
pub struct PyRegex {
pub inner: Regex,
pub inner: SysRegex,
pub pattern: String,
}

Expand All @@ -15,8 +15,8 @@ impl PyRegex {
#[pyo3(text_signature = "(self, pattern)")]
fn new(s: &str) -> PyResult<Self> {
Ok(Self {
inner: Regex::new(s)
.map_err(|e| exceptions::PyException::new_err(e.description().to_owned()))?,
inner: SysRegex::new(s)
.map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?,
pattern: s.to_owned(),
})
}
Expand Down
6 changes: 3 additions & 3 deletions tokenizers/src/models/bpe/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,14 +460,14 @@ impl BPE {
}

fn tokenize_with_cache(&self, sequence: &str) -> Result<Vec<Token>> {
if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {
return Ok(self.word_to_tokens(hit).collect());
}
if self.ignore_merges {
if let Some(id) = self.vocab.get(sequence) {
return Ok(vec![Token::new(*id, sequence.to_string().clone(), (0, 0))]);
}
}
if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {
return Ok(self.word_to_tokens(hit).collect());
}
let word = self.merge_word(sequence)?;
let ret = self.word_to_tokens(&word).collect();
if let Some(ref cache) = self.cache {
Expand Down
30 changes: 30 additions & 0 deletions tokenizers/src/utils/fancy.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::tokenizer::pattern::Pattern;
use crate::Offsets;
use fancy_regex::Regex;
use std::error::Error;

Expand Down Expand Up @@ -31,3 +33,31 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
}
}
}

// Splits `inside` into alternating unmatched/matched spans per the `Pattern`
// contract: each offset pair is tagged `true` when it covers a regex match
// and `false` for the gap text between matches.
impl Pattern for &Regex {
    fn find_matches(
        &self,
        inside: &str,
    ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
        // An empty haystack still yields a single empty, unmatched span.
        if inside.is_empty() {
            return Ok(vec![((0, 0), false)]);
        }

        let mut pieces = Vec::with_capacity(inside.len());
        let mut cursor = 0;
        for found in self.find_iter(inside) {
            // fancy-regex iteration is fallible (e.g. backtracking limits);
            // propagate the error instead of swallowing it.
            let m = found?;
            let (begin, finish) = (m.start(), m.end());
            // Emit the unmatched gap preceding this match, if any.
            if cursor != begin {
                pieces.push(((cursor, begin), false));
            }
            pieces.push(((begin, finish), true));
            cursor = finish;
        }
        // Trailing text after the final match becomes one last unmatched span.
        if cursor != inside.len() {
            pieces.push(((cursor, inside.len()), false));
        }
        Ok(pieces)
    }
}
Comment on lines +37 to +63
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is copied from onig?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed.

Loading