From c0d8938bd4c888a514b4eb62561c01aced4804d8 Mon Sep 17 00:00:00 2001 From: alan Date: Thu, 18 Jan 2024 16:39:55 +0800 Subject: [PATCH] Fix convert --- .pylintrc | 2 +- charset_mnbvc/api.py | 14 ++++++------- convert_files.py | 49 +++++++++++++++++--------------------------- 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/.pylintrc b/.pylintrc index 28cbd1a..5b983b2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,2 +1,2 @@ [MESSAGES CONTROL] -disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411 \ No newline at end of file +disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411,E1101,W0127,W0621 \ No newline at end of file diff --git a/charset_mnbvc/api.py b/charset_mnbvc/api.py index 0b1d63d..b5d8779 100644 --- a/charset_mnbvc/api.py +++ b/charset_mnbvc/api.py @@ -160,7 +160,7 @@ def check_by_cchardect(data: bytes): ret = decode_check(data, "utf-8") if ret: converted_encoding = "utf_8" - except Exception as err: + except Exception: converted_encoding = converted_encoding return converted_encoding @@ -300,8 +300,8 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str :return: decoded characters """ try: - decode_data = byte_sequence.decode(encoding, errors) - return decode_data + decode_data = byte_sequence.decode(encoding) + return True, decode_data except UnicodeDecodeError as e: # 解码左侧有效字符 invalid_bytes = byte_sequence[e.start:e.end] @@ -320,13 +320,13 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str :TIPS_CONTEXT_RANGE] break else: # 超过最大异常字节数,提示更换解码方式 - raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes), + msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes), "There are too many invalid bytes, please change codec.") # 格式化非法字节输出 invalid_str = "\\x" + '\\x'.join([hex(b)[2:].zfill(2) for b in invalid_bytes]) - raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes), - f"There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"") - + msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes), + f'There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"') + return False, msg def test(): print("test") diff --git a/convert_files.py b/convert_files.py index 5253199..01774e4 100644 --- a/convert_files.py +++ b/convert_files.py @@ -104,6 +104,7 @@ def revert_files(file_path): def convert_file_to_utf8(file): + """ 将单个文件转换为utf-8编码 """ @@ -116,36 +117,24 @@ def convert_file_to_utf8(file): msg = f"{file_path} 转换失败, 编码格式错误:{encoding} 可能是文件内容为空!" os.remove(file_path) return False, msg + read_data = b'' + try: # overwrite raw file - with open(raw_file_path, "r", encoding=encoding) as f_in: - with open(file_path, "w", encoding="utf-8") as f_out: - f_out.write(f_in.read()) + with open(raw_file_path, "rb") as f_in: + read_data = f_in.read() + + with open(file_path, "w", encoding="utf-8") as f_out: + out_data = read_data.decode(encoding) + f_out.write(out_data) except Exception as e: - msg = f"{file_path} {encoding} 转换到utf8失败, {e}" + is_ok, check_msg = api.decode_check(read_data, encoding) + msg = f"{file_path} {encoding} 转换到utf8失败, {check_msg}" os.remove(file_path) return False, msg - # 检测encoding是否为gbk或者gb18030,调用pyicu进行转换 - # if encoding.lower() in ["gbk", "gb18030"]: - # try: - # # encoding = "GBK" - # # with open(raw_file_path, "rb") as f: - # # data = f.read() - # # encoding = api.from_data(data=data, mode=3) - - # convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding) - # msg = msg + f" 重新转换 {encoding},使用 icu 成功" - # return True, msg - # except Exception as e: - # msg = msg + f" 重新转换 {encoding},使用 icu 失败" - # os.remove(file_path) - # return False, msg - # else: - # os.remove(file_path) - # return False, msg return True, None @@ -187,7 +176,6 @@ def run_convert_files(files, process_num): for future in futures: results.append(future.result()) pbar.update(1) - return results @@ -261,14 +249,15 @@ def main(): print(f"总文件数: {len(results)}") print(f"转换成功文件数: {success_count}") print(f"转换失败文件数: {failed_count}") - print(f"转换失败文件列表已保存至: {convert_result_file_name}") for msg in failed_msgs: - #sys.stderr.write(f"{msg}\n") - # 将转换错误结果保存至文件 - with open(convert_result_file_name, 'w', newline='') as file: - writer = csv.writer(file) - for row in results: - writer.writerow(row) + sys.stderr.write(f"{msg}\n") + + # 将转换错误结果保存至文件 + print(f"转换失败文件列表已保存至: {convert_result_file_name}") + with open(convert_result_file_name, 'w', newline='') as file: + writer = csv.writer(file) + for row in results: + writer.writerow(row) print("###################################### Step2 end ######################################")