Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing some performance bottlenecks #183

Merged
merged 14 commits into from
May 3, 2022
17 changes: 7 additions & 10 deletions charset_normalizer/cd.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,10 @@ def characters_popularity_compare(
raise ValueError("{} not available".format(language))

character_approved_count = 0 # type: int
FREQUENCIES_language_set = set(FREQUENCIES[language])

for character in ordered_characters:
if character not in FREQUENCIES[language]:
if character not in FREQUENCIES_language_set:
continue

characters_before_source = FREQUENCIES[language][
Expand All @@ -186,23 +187,19 @@ def characters_popularity_compare(
characters_after_source = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]

characters_before = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]

before_match_count = [
e in characters_before for e in characters_before_source
].count(
True
before_match_count = len(
set(characters_before) & set(characters_before_source)
) # type: int
after_match_count = [
e in characters_after for e in characters_after_source
].count(
True

after_match_count = len(
set(characters_after) & set(characters_after_source)
) # type: int

if len(characters_before_source) == 0 and before_match_count <= 4:
Expand Down
8 changes: 3 additions & 5 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
Expand Down Expand Up @@ -139,11 +140,7 @@ def eligible(self, character: str) -> bool:
return True

def feed(self, character: str) -> None:
if (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
):
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1

Expand Down Expand Up @@ -446,6 +443,7 @@ def ratio(self) -> float:
return self._successive_upper_lower_count_final / self._character_count


@lru_cache(maxsize=1024)
Ousret marked this conversation as resolved.
Show resolved Hide resolved
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
Expand Down
12 changes: 12 additions & 0 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def is_latin(character: str) -> bool:
return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Ousret marked this conversation as resolved.
Show resolved Hide resolved
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
Expand Down Expand Up @@ -197,6 +198,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether a character should be counted as unprintable.

    A character is reported as unprintable when it is neither whitespace nor
    printable, and it is not the ASCII substitute character (code point 0x1A).

    :param character: The character to inspect.
    :return: True if the character is considered unprintable, False otherwise.
    """
    return (
        not character.isspace()  # whitespace includes \n \t \r \v
        and not character.isprintable()
        and character != "\x1A"  # Why? It's the ASCII substitute character.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
Expand Down