From e0c594c91a9ecd5be8a04d90949df3276ee0befc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Thu, 23 May 2024 09:00:01 +0200 Subject: [PATCH] Use charset-normalizer for encoding detection Fixes #222 --- aeidon/encodings.py | 21 +++++---------------- aeidon/util.py | 14 +++++++------- setup-aeidon.py | 4 ++-- 3 files changed, 14 insertions(+), 25 deletions(-) diff --git a/aeidon/encodings.py b/aeidon/encodings.py index 41d5873e..17daac91 100644 --- a/aeidon/encodings.py +++ b/aeidon/encodings.py @@ -177,23 +177,12 @@ def detect(path): bom_encoding = detect_bom(path) if bom_encoding is not None: return bom_encoding - from chardet import universaldetector - detector = universaldetector.UniversalDetector() - with open(path, "rb") as f: - detector.reset() - for line in f: - detector.feed(line) - if detector.done: break - detector.close() - code = detector.result["encoding"] - if code is None: return None - try: - # chardet returns what seem to be IANA names. They need to be - # translated to their Python equivalents. Some of the encodings - # returned by chardet are not supported by Python. - return translate_code(code) - except ValueError: + from charset_normalizer import from_path + detector = from_path(path) + result = detector.best() + if result is None: return None + return result.encoding def detect_bom(path): """Return corresponding encoding if BOM found, else ``None``.""" diff --git a/aeidon/util.py b/aeidon/util.py index 69c15259..15c8bb39 100644 --- a/aeidon/util.py +++ b/aeidon/util.py @@ -113,11 +113,11 @@ def atomic_open(path, mode="w", *args, **kwargs): @aeidon.deco.once def chardet_available(): - """Return ``True`` if :mod:`chardet` module is available.""" + """Return ``True`` if :mod:`charset_normalizer` module is available.""" try: - import chardet # noqa + import charset_normalizer # noqa return True - except Exception: + except ImportError: return False def compare_versions(x, y): @@ -207,11 +207,11 @@ def flatten(lst): return flat_lst def get_chardet_version(): - """Return :mod:`chardet` version number as string or ``None``.""" + """Return :mod:`charset_normalizer` version number as string or ``None``.""" try: - import chardet - return chardet.__version__ - except Exception: + import charset_normalizer + return "charset_normalizer {}".format(charset_normalizer.__version__) + except ImportError: return None @aeidon.deco.once diff --git a/setup-aeidon.py b/setup-aeidon.py index 15723b9f..a6879350 100755 --- a/setup-aeidon.py +++ b/setup-aeidon.py @@ -24,8 +24,8 @@ license="GPL", packages=find_packages(exclude=["gaupol*", "*.test"]), package_data={"aeidon": ["data/*/*"]}, - python_requires=">=3.2.0", - install_requires=["chardet>=2.2.1"], + python_requires=">=3.5.0", + install_requires=["charset-normalizer>2.0"], ) shutil.rmtree("aeidon/data")