Skip to content

Commit

Permalink
Fix convert
Browse files Browse the repository at this point in the history
  • Loading branch information
alanshi committed Jan 18, 2024
1 parent 646411e commit c0d8938
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[MESSAGES CONTROL]
- disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411
+ disable=C0303,C0111,W0223,R0914,C0103,R0915,R0912,W0707,C0301,W0311,E0401,W0612,W0718,R1728,W0613,C0206,R1705,R1710,W0707,C0209,R1718,C0413,C0411,E1101,W0127,W0621
14 changes: 7 additions & 7 deletions charset_mnbvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def check_by_cchardect(data: bytes):
ret = decode_check(data, "utf-8")
if ret:
converted_encoding = "utf_8"
- except Exception as err:
+ except Exception:
converted_encoding = converted_encoding

return converted_encoding
Expand Down Expand Up @@ -300,8 +300,8 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str
:return: decoded characters
"""
try:
- decode_data = byte_sequence.decode(encoding, errors)
- return decode_data
+ decode_data = byte_sequence.decode(encoding)
+ return True, decode_data
except UnicodeDecodeError as e:
# 解码左侧有效字符
invalid_bytes = byte_sequence[e.start:e.end]
Expand All @@ -320,13 +320,13 @@ def decode_check(byte_sequence: bytes, encoding='utf-8', errors='strict') -> str
:TIPS_CONTEXT_RANGE]
break
else: # 超过最大异常字节数,提示更换解码方式
- raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
+ msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
"There are too many invalid bytes, please change codec.")
# 格式化非法字节输出
invalid_str = "\\x" + '\\x'.join([hex(b)[2:].zfill(2) for b in invalid_bytes])
- raise UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
- f"There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"")
-
+ msg = UnicodeDecodeError(encoding, invalid_bytes, e.start, e.start + len(invalid_bytes),
+ f'There are invalid bytes in the string \"{left_chars + invalid_str + right_chars}\"')
+ return False, msg

def test():
print("test")
49 changes: 19 additions & 30 deletions convert_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def revert_files(file_path):


def convert_file_to_utf8(file):

"""
将单个文件转换为utf-8编码
"""
Expand All @@ -116,36 +117,24 @@ def convert_file_to_utf8(file):
msg = f"{file_path} 转换失败, 编码格式错误:{encoding} 可能是文件内容为空!"
os.remove(file_path)
return False, msg
+ read_data = b''
+

try:
# overwrite raw file
- with open(raw_file_path, "r", encoding=encoding) as f_in:
- with open(file_path, "w", encoding="utf-8") as f_out:
- f_out.write(f_in.read())
+ with open(raw_file_path, "rb") as f_in:
+ read_data = f_in.read()
+
+ with open(file_path, "w", encoding="utf-8") as f_out:
+ out_data = read_data.decode(encoding)
+ f_out.write(out_data)

except Exception as e:
- msg = f"{file_path} {encoding} 转换到utf8失败, {e}"
+ is_ok, check_msg = api.decode_check(read_data, encoding)
+ msg = f"{file_path} {encoding} 转换到utf8失败, {check_msg}"
os.remove(file_path)
return False, msg

- # 检测encoding是否为gbk或者gb18030,调用pyicu进行转换
- # if encoding.lower() in ["gbk", "gb18030"]:
- # try:
- # # encoding = "GBK"
- # # with open(raw_file_path, "rb") as f:
- # # data = f.read()
- # # encoding = api.from_data(data=data, mode=3)
-
- # convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
- # msg = msg + f" 重新转换 {encoding},使用 icu 成功"
- # return True, msg
- # except Exception as e:
- # msg = msg + f" 重新转换 {encoding},使用 icu 失败"
- # os.remove(file_path)
- # return False, msg
- # else:
- # os.remove(file_path)
- # return False, msg
return True, None


Expand Down Expand Up @@ -187,7 +176,6 @@ def run_convert_files(files, process_num):
for future in futures:
results.append(future.result())
pbar.update(1)

return results


Expand Down Expand Up @@ -261,14 +249,15 @@ def main():
print(f"总文件数: {len(results)}")
print(f"转换成功文件数: {success_count}")
print(f"转换失败文件数: {failed_count}")
- print(f"转换失败文件列表已保存至: {convert_result_file_name}")
for msg in failed_msgs:
- #sys.stderr.write(f"{msg}\n")
- # 将转换错误结果保存至文件
- with open(convert_result_file_name, 'w', newline='') as file:
- writer = csv.writer(file)
- for row in results:
- writer.writerow(row)
+ sys.stderr.write(f"{msg}\n")
+
+ # 将转换错误结果保存至文件
+ print(f"转换失败文件列表已保存至: {convert_result_file_name}")
+ with open(convert_result_file_name, 'w', newline='') as file:
+ writer = csv.writer(file)
+ for row in results:
+ writer.writerow(row)

print("###################################### Step2 end ######################################")

Expand Down

0 comments on commit c0d8938

Please sign in to comment.