Skip to content

Commit

Permalink
Enabling the option to use fancy_regex instead of onig.
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil committed Aug 1, 2024
1 parent 9e0c791 commit 7415e28
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
1 change: 0 additions & 1 deletion bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ env_logger = "0.11"
pyo3 = { version = "0.21" }
numpy = "0.21"
ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.12"

[dependencies.tokenizers]
Expand Down
8 changes: 4 additions & 4 deletions bindings/python/src/utils/regex.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use onig::Regex;
use pyo3::exceptions;
use pyo3::prelude::*;
use tk::utils::SysRegex;

/// Instantiate a new Regex with the given pattern
// Exposed to Python under the name `Regex` in the `tokenizers` module (see the `pyclass` attribute).
#[pyclass(module = "tokenizers", name = "Regex")]
pub struct PyRegex {
// NOTE(review): `inner` is declared twice below (once as onig's `Regex`, once as `SysRegex`).
// This looks like the removed and added sides of a diff shown together; only one declaration
// can exist in a compiling struct — confirm which one is current.
pub inner: Regex,
pub inner: SysRegex,
// Owned copy of the pattern string that `inner` was built from.
pub pattern: String,
}

Expand All @@ -15,8 +15,8 @@ impl PyRegex {
#[pyo3(text_signature = "(self, pattern)")]
// Builds a `PyRegex` from the pattern string `s`, storing an owned copy of the pattern next to
// the compiled regex. If compilation fails, the error's message is wrapped in a Python exception
// and returned as the `Err` side of the `PyResult`.
fn new(s: &str) -> PyResult<Self> {
Ok(Self {
// NOTE(review): two `inner:` initialisers follow (onig `Regex::new` and `SysRegex::new`).
// This looks like both sides of a diff rendered together; a struct literal can only
// initialise `inner` once — confirm which line is current.
inner: Regex::new(s)
.map_err(|e| exceptions::PyException::new_err(e.description().to_owned()))?,
inner: SysRegex::new(s)
// NOTE(review): presumably `to_string()` already yields an owned `String`, which would
// make the trailing `to_owned()` a redundant copy — verify against the error type.
.map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?,
pattern: s.to_owned(),
})
}
Expand Down
30 changes: 30 additions & 0 deletions tokenizers/src/utils/fancy.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use crate::tokenizer::pattern::Pattern;
use crate::Offsets;
use fancy_regex::Regex;
use std::error::Error;

Expand Down Expand Up @@ -31,3 +33,31 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
}
}
}

impl Pattern for &Regex {
    /// Partitions `inside` into consecutive spans, each tagged with whether it is a regex match.
    ///
    /// Every item is `((start, end), is_match)`, where the offsets come straight from the
    /// match's `start()`/`end()` and from `inside.len()`. Text between two matches, before the
    /// first match, or after the last match is emitted with `is_match == false`; zero-width
    /// gaps are not emitted.
    ///
    /// An empty `inside` yields the single span `((0, 0), false)`.
    ///
    /// # Errors
    ///
    /// Returns the first error produced while iterating the regex matches.
    fn find_matches(
        &self,
        inside: &str,
    ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
        let total = inside.len();
        if total == 0 {
            return Ok(vec![((0, 0), false)]);
        }

        let mut pieces = Vec::with_capacity(total);
        // End offset of the last span emitted so far.
        let mut cursor = 0;
        for found in self.find_iter(inside) {
            let found = found?;
            let (from, to) = (found.start(), found.end());
            // Emit the unmatched gap that precedes this match, if there is one.
            if from != cursor {
                pieces.push(((cursor, from), false));
            }
            pieces.push(((from, to), true));
            cursor = to;
        }
        // Trailing text after the final match (or the whole input when nothing matched).
        if cursor != total {
            pieces.push(((cursor, total), false));
        }
        Ok(pieces)
    }
}

0 comments on commit 7415e28

Please sign in to comment.