Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a precompiled regex for performance improvements #129

Merged
merged 1 commit into from
Sep 15, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions symspellpy/symspellpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from symspellpy.verbosity import Verbosity

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Word pattern compiled once at module level so it can be reused through
# `_rec.findall(...)` instead of passing the pattern string to `re.findall`
# on each call. It matches runs of word characters other than "_" together
# with the apostrophes ' and ’. Because the pattern has two capturing
# groups, `findall` yields tuples; the full word is the first element.
_rec = re.compile(r"(([^\W_]|['’])+)")


class SymSpell(PickleMixin):
Expand Down Expand Up @@ -183,10 +184,8 @@ def create_dictionary(
if not corpus.exists():
logger.error(f"Corpus not found at {corpus}.")
return False
with open(corpus, "r", encoding=encoding) as infile:
for line in infile:
for key in self._parse_words(line):
self.create_dictionary_entry(key, 1)
for key in self._parse_words(corpus.read_text(encoding=encoding)):
self.create_dictionary_entry(key, 1)
else:
for line in corpus:
for key in self._parse_words(line):
Expand Down Expand Up @@ -1042,9 +1041,8 @@ def _edits(
return delete_words
for i in range(current_distance, len(word)):
delete = word[:i] + word[i + 1 :]
if delete in delete_words:
continue
delete_words.add(delete)
if delete not in delete_words:
delete_words.add(delete)
# recursion, if maximum edit distance not yet reached
if edit_distance < self._max_dictionary_edit_distance:
self._edits(delete, edit_distance, delete_words, current_distance=i)
Expand Down Expand Up @@ -1140,7 +1138,7 @@ def _parse_words(text: str) -> List[str]:
# excluding "_". Compatible with non-latin characters, does not split
# words at apostrophes. Uses capturing groups to combine a negated set
# with a character set.
matches = re.findall(r"(([^\W_]|['’])+)", text.lower())
matches = _rec.findall(text.lower())
# The above regex returns ("ghi'jkl", "l") for "ghi'jkl", so we extract
# the first element
matches = [match[0] for match in matches]
Expand Down