Skip to content

Commit

Permalink
Improvement on probe chaos (#31)
Browse files Browse the repository at this point in the history
* Reviewed the detection of alternating upper-case/lower-case letters

* For each word, verify that the dominant symbol does not account for more than 50% of the word length

* Add the copyright sign to the punctuation sign list

* Add subtest in test_on_bytes

* Replace Windows backslash path syntax with Linux forward-slash path syntax (paper/features.py)

* bump version
  • Loading branch information
Ousret authored Dec 16, 2019
1 parent 48c2e6b commit a90a899
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 24 deletions.
10 changes: 5 additions & 5 deletions charset_normalizer/probe_chaos.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def __add__(self, other):
def _probe(self):

c__ = False
upper_lower_m = False
upper_lower_m = 0

for c, i_ in zip(self._string, range(0, len(self._string))):

Expand Down Expand Up @@ -197,13 +197,13 @@ def _probe(self):
continue

if (is_lower and self.previous_printable_letter.isupper()) or (is_upper and self.previous_printable_letter.islower()):
if not upper_lower_m:
upper_lower_m = True
if upper_lower_m < 2:
upper_lower_m += 1
else:
self.successive_upper_lower += 1
upper_lower_m = False
upper_lower_m = 0
else:
upper_lower_m = False
upper_lower_m = 0

if is_latin:
self.previous_encountered_unicode_range = u_name
Expand Down
5 changes: 4 additions & 1 deletion charset_normalizer/probe_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ def _probe(self):
if UnicodeRangeIdentify.is_range_secondary(u_name) is True:
c_ += u_occ

c_el = HashableCounter(el)

if (not is_latin_based and c_ > int(w_len / 4)) \
or (is_latin_based and len(el) >= 9 and c_el.most_common()[0][1] >= sum(c_el.values()) * 0.5) \
or (is_latin_based and c_ > int(w_len / 2)) \
or (UnicodeRangeIdentify.part_punc(el) > 0.4 and len(classification.keys()) > 1) \
or (not is_latin_based and UnicodeRangeIdentify.part_accent(el) > 0.4) \
Expand All @@ -65,5 +68,5 @@ def _probe(self):

@property
def ratio(self):
return len(self._suspicious) / self._nb_words if self._nb_words > 5 else 0.
return len(self._suspicious) / self._nb_words if self._nb_words >= 1 else 0.

2 changes: 1 addition & 1 deletion charset_normalizer/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def is_punc(letter):
return r_name is not None and \
("Punctuation" in r_name or
'Forms' in r_name or
letter in set('º¯—–‒‐⁃«‹?!;.:^$¥*»£¹¿~ª؟©±¡{}[]|½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒™℠¬‼⁇❝❞¶⁋√↑↓�¤`¨'))
letter in set('º¯—–‒‐⁃«‹?!;.:^$¥*»£¹¿~ª؟©±¡{}[]|½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒™℠¬‼⁇❝❞¶⁋√↑↓�¤©`¨'))

@staticmethod
@lru_cache(maxsize=8192)
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "1.3.3"
__version__ = "1.3.4"
VERSION = __version__.split('.')
2 changes: 1 addition & 1 deletion paper/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def does_it_matter(raw_bytes, detected_encoding, should_be_encoding):

for path in files_queue:

target_encoding_dir = path.split('\\')[-2].lower().replace('-', '_')
target_encoding_dir = path.split('/')[-2].lower().replace('-', '_')

if target_encoding_dir.startswith('windows_'):
target_encoding_dir = '_'.join(target_encoding_dir.split('_')[:2])
Expand Down
34 changes: 19 additions & 15 deletions test/test_probe_chaos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,24 @@ class TestProbeChaos(unittest.TestCase):

def test_not_gibberish(self):

self.assertLessEqual(
ProbeChaos('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。').ratio,
0.
)

self.assertEqual(
ProbeChaos('العقلية , التنويم المغناطيسي و / أو الاقتراح').ratio,
0.
)

self.assertEqual(
ProbeChaos("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل").ratio,
0.
)
with self.subTest('Chinese Not Gibberish'):
self.assertLessEqual(
ProbeChaos(
'典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。').ratio,
0.
)

with self.subTest('Arabic Not Gibberish'):
self.assertEqual(
ProbeChaos('العقلية , التنويم المغناطيسي و / أو الاقتراح').ratio,
0.
)

with self.subTest('Arabic Styled Not Gibberish'):
self.assertEqual(
ProbeChaos("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل").ratio,
0.
)

def test_subtle_gibberish(self):

Expand All @@ -41,7 +45,7 @@ def test_subtle_gibberish(self):

self.assertLessEqual(
ProbeChaos("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v").ratio,
0.5
0.9
)

self.assertGreater(
Expand Down

0 comments on commit a90a899

Please sign in to comment.