Remove icu

alanshi · Jan 11, 2024 · b5143bb
1 parent f87b473
commit b5143bb
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -142,7 +142,7 @@ optional arguments:
   -p PROCESS_NUM, --process_num PROCESS_NUM
                         指定进程数，默认为4
   -c, --cchardet        使用cchardet方案,
-  -m, --mode            mode=1 mnbvc方案 , mode=2 cchardet方案(默认), mode3=3 pyicu
+  -m, --mode            mode=1 mnbvc方案 , mode=2 cchardet方案(默认)
   -i inputDirectory     inputDirectory为需要检测的目录
   -step PROCESS_STEP    执行步骤,1为编码检测,2为编码转换
   -r result_file_name   指定编码检测结果文件名

diff --git a/charset_mnbvc/api.py b/charset_mnbvc/api.py
@@ -10,7 +10,6 @@
 from .constant import (CCHARDECT_ENCODING_MAP, ENCODINGS, EXT_ENCODING,
                        REGEX_FEATURE_ALL, TIPS_CONTEXT_RANGE, MAX_ENCODING_SIZE, MAX_INVALID_BYTES_SIZE)
 
-import icu
 
 # compile makes it more efficient
 re_char_check = compile(REGEX_FEATURE_ALL)
@@ -69,7 +68,7 @@ def fix_data(s: str) -> list:
 def from_data(data, mode) -> str:
     """
     :param data: data
-    :param mode: 1:cchardet 2:mnbvc, 3:icu
+    :param mode: 1:cchardet 2:mnbvc
     :return: encoding
     """
     coding_name = get_cn_charset(
@@ -135,16 +134,16 @@ def scan_dir(folder_path, ext='.txt'):
     return sub_folders, files
 
 
-def check_by_icu(data):
-    """
-    :param data:data
-    :return: encoding
-    """
-    encoding = icu.CharsetDetector(data).detect().getName()
+# def check_by_icu(data):
+#     """
+#     :param data:data
+#     :return: encoding
+#     """
+#     encoding = icu.CharsetDetector(data).detect().getName()
 
-    converted_encoding = CCHARDECT_ENCODING_MAP.get(encoding)
+#     converted_encoding = CCHARDECT_ENCODING_MAP.get(encoding)
 
-    return converted_encoding
+#     return converted_encoding
 
 
 def check_by_cchardect(data):
@@ -218,7 +217,7 @@ def check_disorder_chars(file_path, threshold=0.1):
 def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=None):
     """
     :param source_data: file path
-    :param mode: 1: mnbvc, 2: cchardet, 3: icu
+    :param mode: 1: mnbvc, 2: cchardet
     :param source_type: file or data
     :return: encoding
     """
@@ -259,8 +258,6 @@ def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=No
                 data=data, special_encodings=special_encodings)
         elif mode == 2:
             encoding = check_by_cchardect(data=data)
-        elif mode == 3:
-            encoding = check_by_icu(data=data)
         else:
             sys.stderr.write(f'Error: mode {mode} is not supported.')
 

diff --git a/charset_mnbvc/version.py b/charset_mnbvc/version.py
@@ -1,4 +1,4 @@
 """version"""
 
-__version__ = '0.0.12'
+__version__ = '0.0.14'
 VERSION = __version__.split('.')
diff --git a/convert_files.py b/convert_files.py
@@ -9,7 +9,7 @@
 from tqdm import tqdm
 
 from charset_mnbvc import api, verify
-from icu import UnicodeString
+# from icu import UnicodeString
 
 BLOCK_SIZE = 1024 * 1024
 
@@ -82,15 +82,15 @@ def revert_files(file_path):
 
     return True
 
-def convert_file_to_utf8_use_icu(input_file, output_file, encoding):
-        # 打开二进制文件进行读取
-    with open(input_file, "rb") as f_input:
-        with open(output_file, "w") as f_output:
-            data = f_input.read()
-            # 将读取的数据转换为UTF-8编码
-            utf8_data = UnicodeString(data, encoding.upper())
-            # 将转换后的UTF-8数据写入输出文件
-            f_output.write(str(utf8_data))
+# def convert_file_to_utf8_use_icu(input_file, output_file, encoding):
+#         # 打开二进制文件进行读取
+#     with open(input_file, "rb") as f_input:
+#         with open(output_file, "w") as f_output:
+#             data = f_input.read()
+#             # 将读取的数据转换为UTF-8编码
+#             utf8_data = UnicodeString(data, encoding.upper())
+#             # 将转换后的UTF-8数据写入输出文件
+#             f_output.write(str(utf8_data))
 
 def convert_file_to_utf8(file):
     """
@@ -113,28 +113,28 @@ def convert_file_to_utf8(file):
                 f_out.write(f_in.read())
 
     except Exception as e:
-        # 检测encoding是否为gbk或者gb18030，调用pyicu进行转换
         msg = f"{file_path} {encoding} 转换到utf8失败, {e}"
-        if encoding.lower() in ["gbk", "gb18030"]:
-            try:
-                # encoding = "GBK"
-                # with open(raw_file_path, "rb") as f:
-                #     data = f.read()
-                #     encoding = api.from_data(data=data, mode=3)
-
-                convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
-                msg = msg + f" 重新转换 {encoding}，使用 icu 成功"
-                return True, msg
-            except Exception as e:
-                msg = msg + f" 重新转换 {encoding}，使用 icu 失败"
-                os.remove(file_path)
-                return False, msg
-        else:
-            os.remove(file_path)
-            return False, msg
-
-        # os.remove(file_path)
-        # return False, msg
+        os.remove(file_path)
+        return False, msg
+
+        # 检测encoding是否为gbk或者gb18030，调用pyicu进行转换
+        # if encoding.lower() in ["gbk", "gb18030"]:
+        #     try:
+        #         # encoding = "GBK"
+        #         # with open(raw_file_path, "rb") as f:
+        #         #     data = f.read()
+        #         #     encoding = api.from_data(data=data, mode=3)
+
+        #         convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
+        #         msg = msg + f" 重新转换 {encoding}，使用 icu 成功"
+        #         return True, msg
+        #     except Exception as e:
+        #         msg = msg + f" 重新转换 {encoding}，使用 icu 失败"
+        #         os.remove(file_path)
+        #         return False, msg
+        # else:
+        #     os.remove(file_path)
+        #     return False, msg
     return True, None
 
 

diff --git a/setup.py b/setup.py
@@ -27,7 +27,7 @@
         "Source": "https://github.com/alanshi/charset_mnbvc/",
     },
     install_requires=[
-        'cchardet',
+        'faust-cchardet',
         'tqdm'
     ],
 )