From c0d8938bd4c888a514b4eb62561c01aced4804d8 Mon Sep 17 00:00:00 2001
From: alan <alan.shi86@gmail.com>
Date: Thu, 18 Jan 2024 16:39:55 +0800
Subject: [PATCH] Fix convert: decode raw bytes and report invalid-byte context on failure

---
 .pylintrc            |  2 +-
 charset_mnbvc/api.py | 14 ++++++-------
 convert_files.py     | 49 +++++++++++++++++---------------------------
 3 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 28cbd1a..5b983b2 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,2 +1,2 @@
 [MESSAGES CONTROL]
-disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411
\ No newline at end of file
+disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411,E1101,W0127,W0621
\ No newline at end of file
diff --git a/charset_mnbvc/api.py b/charset_mnbvc/api.py
index 0b1d63d..b5d8779 100644
--- a/charset_mnbvc/api.py
+++ b/charset_mnbvc/api.py
@@ -160,7 +160,7 @@ def check_by_cchardect(data: bytes):
-            ret = decode_check(data, "utf-8")
-            if ret:
+            is_ok, _ = decode_check(data, "utf-8")  # returns (ok, text_or_error); a bare tuple is always truthy
+            if is_ok:
                 converted_encoding = "utf_8"
-        except Exception as err:
+        except Exception:
             converted_encoding = converted_encoding
 
     return converted_encoding
@@ -300,8 +300,8 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str
-    :return: decoded characters
+    :return: (True, decoded text) on success, (False, UnicodeDecodeError) on failure
     """
     try:
         decode_data = byte_sequence.decode(encoding, errors)
-        return decode_data
+        return True, decode_data
     except UnicodeDecodeError as e:
         # 解码左侧有效字符
         invalid_bytes = byte_sequence[e.start:e.end]
@@ -320,13 +320,14 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str
                                   :TIPS_CONTEXT_RANGE]
                     break
         else:  # 超过最大异常字节数，提示更换解码方式
-            raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
+            msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
                                      "There are too many invalid bytes, please change codec.")
+            return False, msg  # stop here like the old raise did; falling through would overwrite this error
         # 格式化非法字节输出
         invalid_str = "\\x" + '\\x'.join([hex(b)[2:].zfill(2) for b in invalid_bytes])
-        raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
-                                 f"There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"")
-
+        msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
+                                 f'There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"')
+        return False, msg
 
 def test():
     print("test")
diff --git a/convert_files.py b/convert_files.py
index 5253199..01774e4 100644
--- a/convert_files.py
+++ b/convert_files.py
@@ -104,6 +104,7 @@ def revert_files(file_path):
 
 
 def convert_file_to_utf8(file):
+
     """
     将单个文件转换为utf-8编码
     """
@@ -116,36 +117,24 @@ def convert_file_to_utf8(file):
         msg = f"{file_path} 转换失败, 编码格式错误:{encoding} 可能是文件内容为空!"
         os.remove(file_path)
         return False, msg
+    read_data = b''
 
     try:
         # overwrite raw file
-        with open(raw_file_path, "r", encoding=encoding) as f_in:
-            with open(file_path, "w", encoding="utf-8") as f_out:
-                f_out.write(f_in.read())
+        with open(raw_file_path, "rb") as f_in:
+            read_data = f_in.read()
+
+        with open(file_path, "w", encoding="utf-8") as f_out:
+            out_data = read_data.decode(encoding)
+            f_out.write(out_data)
 
     except Exception as e:
-        msg = f"{file_path} {encoding} 转换到utf8失败, {e}"
+        # decode_check only explains UnicodeDecodeError; keep the original exception text for any other failure
+        check_msg = api.decode_check(read_data, encoding)[1] if isinstance(e, UnicodeDecodeError) else e
+        msg = f"{file_path} {encoding} 转换到utf8失败, {check_msg}"
         os.remove(file_path)
         return False, msg
 
-        # 检测encoding是否为gbk或者gb18030，调用pyicu进行转换
-        # if encoding.lower() in ["gbk", "gb18030"]:
-        #     try:
-        #         # encoding = "GBK"
-        #         # with open(raw_file_path, "rb") as f:
-        #         #     data = f.read()
-        #         #     encoding = api.from_data(data=data, mode=3)
-
-        #         convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
-        #         msg = msg + f" 重新转换 {encoding}，使用 icu 成功"
-        #         return True, msg
-        #     except Exception as e:
-        #         msg = msg + f" 重新转换 {encoding}，使用 icu 失败"
-        #         os.remove(file_path)
-        #         return False, msg
-        # else:
-        #     os.remove(file_path)
-        #     return False, msg
     return True, None
 
 
@@ -187,7 +176,6 @@ def run_convert_files(files, process_num):
             for future in futures:
                 results.append(future.result())
                 pbar.update(1)
-
     return results
 
 
@@ -261,14 +249,15 @@ def main():
         print(f"总文件数: {len(results)}")
         print(f"转换成功文件数: {success_count}")
         print(f"转换失败文件数: {failed_count}")
-        print(f"转换失败文件列表已保存至: {convert_result_file_name}")
         for msg in failed_msgs:
-            #sys.stderr.write(f"{msg}\n")
-            # 将转换错误结果保存至文件
-            with open(convert_result_file_name, 'w', newline='') as file:
-                writer = csv.writer(file)
-                for row in results:
-                    writer.writerow(row)
+            sys.stderr.write(f"{msg}\n")
+
+        # 将转换错误结果保存至文件
+        print(f"转换失败文件列表已保存至: {convert_result_file_name}")
+        with open(convert_result_file_name, 'w', newline='') as file:
+            writer = csv.writer(file)
+            for row in results:
+                writer.writerow(row)
 
         print("###################################### Step2 end ######################################")