Fix hmt encoding (#35)

Sigmmma · May 1, 2020 · 4a0a1e3 · 4a0a1e3
2 parents b18e984 + e497b72
commit 4a0a1e3
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 23 deletions.
diff --git a/CHANGELOG.MD b/CHANGELOG.MD
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 This project also inherits changes from [Binilla](https://github.com/Sigmmma/binilla).
 
+## [1.9.3]
+### Changed
+ - Don't use charset_normalizer. Fixes an issue where the package could not be installed on Windows.
+
 ## [1.9.2]
 ### Changed
  - Fix encoding problems with hmt files that aren't utf-16-le encoded.

diff --git a/mozzarilla/__init__.py b/mozzarilla/__init__.py
@@ -12,6 +12,6 @@
 # ##############
 __author__ = "Sigmmma"
 #           YYYY.MM.DD
-__date__ = "2020.04.28"
-__version__ = (1, 9, 2)
+__date__ = "2020.05.01"
+__version__ = (1, 9, 3)
 __website__ = "https://github.com/Sigmmma/mozzarilla"
diff --git a/mozzarilla/windows/tools/compile_hud_message_text.py b/mozzarilla/windows/tools/compile_hud_message_text.py
@@ -8,7 +8,6 @@
 #
 
 import os
-import charset_normalizer
 
 from pathlib import Path
 from traceback import format_exc
@@ -17,6 +16,30 @@
 from supyr_struct.util import is_path_empty
 from binilla.windows.filedialog import askopenfilename
 
+def hacky_detect_encoding(fp):
+    """Guess the text encoding of the file at ``fp``.
+
+    Returns "utf-16-le" or "utf-16-be" when the file starts with the
+    matching UTF-16 byte order mark, "utf-16-le" when there is no BOM but a
+    null byte sits at an odd offset, and "latin-1" otherwise (including for
+    files shorter than two bytes).
+    """
+    with Path(fp).open("rb") as f:
+        data = f.read()
+
+    # Slice rather than index so files shorter than two bytes fall through
+    # to the latin-1 default instead of raising IndexError.
+    bom = data[:2]
+    if bom == b"\xff\xfe":
+        return "utf-16-le"
+    if bom == b"\xfe\xff":
+        # The big-endian BOM is the byte pair FE FF.
+        return "utf-16-be"
+
+    # No BOM: a null byte at any odd offset is treated as the high byte of
+    # a little-endian UTF-16 code unit, so assume BOM-less utf-16-le.
+    # Otherwise default to latin-1.
+    return "utf-16-le" if 0 in data[1::2] else "latin-1"
 
 def hud_message_text_from_hmt(app, fp=None):
     load_dir = app.last_data_load_dir
@@ -46,15 +69,8 @@ def hud_message_text_from_hmt(app, fp=None):
         print("Creating hud_message_text from this hmt file:")
         print("    %s" % fp)
 
-        with fp.open("rb") as f:
-            contents = f.read()
-            guess = charset_normalizer.detect(contents)
-            # utf-16 is our fallback as that is the proper encoding for
-            # these files and has the biggest detection failure rate.
-            hmt_string_data = contents.decode(guess['encoding'] or "utf-16")
-            # Reading files this way doesn't remove carriage returns.
-            # We have to wipe them out like this.
-            hmt_string_data = hmt_string_data.replace("\r", "")
+        with fp.open("r", encoding=hacky_detect_encoding(fp)) as f:
+            hmt_string_data = f.read()
 
     except Exception:
         print(format_exc())

diff --git a/mozzarilla/windows/tools/compile_strings.py b/mozzarilla/windows/tools/compile_strings.py
@@ -17,6 +17,8 @@
 from supyr_struct.util import is_path_empty
 from binilla.windows.filedialog import askopenfilename
 
+from mozzarilla.windows.tools.compile_hud_message_text import hacky_detect_encoding
+
 
 def strings_from_txt(app, fp=None):
     load_dir = app.last_data_load_dir
@@ -45,15 +47,9 @@ def strings_from_txt(app, fp=None):
         tag_ext = "unicode_string_list"
         tag_cls = "ustr"
 
-        with fp.open("rb") as f:
-            data = f.read(2)
+        encoding = hacky_detect_encoding(fp)
 
-        if data[0] == 255 and data[1] == 254:
-            encoding = "utf-16-le"
-        elif data[1] == 254 and data[0] == 255:
-            encoding = "utf-16-be"
-        else:
-            encoding = "latin-1"
+        if encoding == "latin-1":
             tag_ext = "string_list"
             tag_cls = "str#"
 

diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,3 @@ supyr_struct
 binilla
 reclaimer
 arbytmap
-charset_normalizer
diff --git a/setup.py b/setup.py
@@ -54,8 +54,8 @@
         },
     platforms=["POSIX", "Windows"],
     keywords=["binilla", "binary", "data structure"],
-    install_requires=['reclaimer', 'binilla', 'arbytmap', 'supyr_struct', "charset_normalizer"],
-    requires=['reclaimer', 'arbytmap', 'binilla', "charset_normalizer"],
+    install_requires=['reclaimer', 'binilla', 'arbytmap', 'supyr_struct'],
+    requires=['reclaimer', 'arbytmap', 'binilla'],
     provides=['mozzarilla'],
     python_requires=">=3.5",
     classifiers=[