Skip to content

Commit

Permalink
✨ Minor improvements for the MD (#101)
Browse files Browse the repository at this point in the history
* ❇️ Ignore the ASCII substitute character in the UnprintablePlugin

* ❇️ Ignore common ASCII sep used in all sub CP for SuspiciousRangePlugin

* 🎨 reformat md.py
  • Loading branch information
Ousret authored Sep 16, 2021
1 parent 0c52bfe commit 4db2367
Showing 1 changed file with 22 additions and 1 deletion.
23 changes: 22 additions & 1 deletion charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def feed(self, character: str) -> None:
character not in {"\n", "\t", "\r", "\v"}
and character.isprintable() is False
and character.isspace() is False
and ord(character) != 0x1A # Why? Its the ASCII substitute character.
):
self._unprintable_count += 1
self._character_count += 1
Expand Down Expand Up @@ -218,7 +219,27 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if character.isspace() or is_punctuation(character):
if (
character.isspace()
or is_punctuation(character)
or character
in [
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
]
):
self._last_printable_seen = None
return

Expand Down

0 comments on commit 4db2367

Please sign in to comment.