Skip to content

Commit

Permalink
Fix missing code
Browse files Browse the repository at this point in the history
  • Loading branch information
alanshi committed Dec 14, 2023
1 parent 7746006 commit d2512b2
Showing 1 changed file with 82 additions and 2 deletions.
84 changes: 82 additions & 2 deletions charset_mnbvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,32 @@

from .common_utils import print_table
from .constant import (CCHARDECT_ENCODING_MAP, ENCODINGS, EXT_ENCODING,
REGEX_FEATURE_ALL)
REGEX_FEATURE_ALL, TIPS_CONTEXT_RANGE, MAX_ENCODING_SIZE, MAX_INVALID_BYTES_SIZE)

import icu

# compile makes it more efficient
re_char_check = compile(REGEX_FEATURE_ALL)


def is_perceivable(s):
    """
    Scan a string for the first character the user could not perceive.

    A character counts as perceivable when ``str.isprintable()`` accepts it,
    or when it is a space, a tab, or a newline.

    Args:
        s (str): The string to check.
    Returns:
        bool | str: True when every character is perceivable (this includes
        the empty string). Otherwise the ``unicode_escape`` text of the first
        offending character, for example ``'\\x00'``.

    Note:
        Both outcomes are truthy, so callers must compare the result with
        ``is True`` instead of relying on truthiness.
    """
    allowed_whitespace = (' ', '\t', '\n')
    offender = next(
        (ch for ch in s
         if not ch.isprintable() and ch not in allowed_whitespace),
        None,
    )
    if offender is None:
        return True
    # Hand back the escaped form so the invisible character can be displayed.
    return offender.encode('unicode_escape').decode()


def has_control_characters(text):
"""
:param text: text
Expand Down Expand Up @@ -116,6 +134,7 @@ def scan_dir(folder_path, ext='.txt'):
files.extend(f)
return sub_folders, files


def check_by_icu(data):
"""
:param data:data
Expand All @@ -127,6 +146,7 @@ def check_by_icu(data):

return converted_encoding


def check_by_cchardect(data):
"""
:param data: data
Expand Down Expand Up @@ -227,11 +247,16 @@ def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=No
# if has_control_characters(data.decode("unicode_escape")):
# return "UNKNOWN"

# return_is_perceivable = is_perceivable(data.decode("unicode_escape"))
# if return_is_perceivable:
# return "UNKNOWN: %s" % return_is_perceivable

except Exception as err:
pass

if mode == 1:
encoding = check_by_mnbvc(data=data, special_encodings=special_encodings)
encoding = check_by_mnbvc(
data=data, special_encodings=special_encodings)
elif mode == 2:
encoding = check_by_cchardect(data=data)
elif mode == 3:
Expand Down Expand Up @@ -273,5 +298,60 @@ def convert_encoding(source_data, source_encoding, target_encoding="utf-8"):
return data


def find_invalid_bytes(byte_sequence: bytes, decoding: str = "gbk") -> None:
    """
    Report where ``byte_sequence`` fails to decode with ``decoding``.

    The whole sequence is decoded once. On success a confirmation message is
    printed. On a UnicodeDecodeError the rejected bytes are collected together
    with some decoded text on either side of them, and a diagnostic is
    printed. Nothing is returned; all output goes through ``print``.

    :param byte_sequence: input bytes
    :param decoding: codec name passed to ``bytes.decode``
    :return: None
    """
    try:
        byte_sequence.decode(decoding)
        print("No decoding errors found, the byte sequence is valid.")
    except UnicodeDecodeError as e:
        # Bytes the codec rejected at the first failure.
        invalid_bytes = byte_sequence[e.start:e.end]
        # Decode the valid characters to the left of the error.
        left_chars = ''
        # index_offset is incremented before each use, so the first window is
        # TIPS_CONTEXT_RANGE + 1 bytes wide and grows by one byte per pass.
        index_offset = TIPS_CONTEXT_RANGE
        while len(left_chars) < TIPS_CONTEXT_RANGE:
            index_offset += 1
            if e.start - index_offset < 0:
                # The window would start before the input: take everything
                # that precedes the error and stop.
                left_chars = byte_sequence[:e.start].decode(decoding)
                break
            try:
                left_chars = byte_sequence[e.start -
                                           index_offset:e.start].decode(decoding)
            except UnicodeDecodeError as _:
                # This window did not decode (presumably it starts in the
                # middle of a character); keep widening.
                pass
        # Decode the valid characters to the right of the error.
        right_chars = ''
        right_curr_index = e.end
        index_offset = TIPS_CONTEXT_RANGE
        while len(right_chars) < TIPS_CONTEXT_RANGE:
            index_offset += 1
            # The window would reach or pass the end of the input: stop with
            # whatever right_chars currently holds.
            # NOTE(review): if fewer than TIPS_CONTEXT_RANGE + 2 bytes follow
            # the error, this breaks on the first pass with right_chars == ''
            # and the "too many invalid bytes" branch below is taken --
            # confirm that is intended.
            if right_curr_index + index_offset >= len(byte_sequence):
                break
            try:
                right_chars = byte_sequence[right_curr_index: right_curr_index +
                                            index_offset].decode(decoding)
            except UnicodeDecodeError as right_e:
                # Once the window exceeds the maximum byte size of the hint
                # context, move the boundary of the invalid bytes.
                # right_e.end is an offset into the slice that starts at
                # right_curr_index, so the bytes appended run from the window
                # start through the end of the failing position.
                if index_offset >= MAX_ENCODING_SIZE * TIPS_CONTEXT_RANGE:
                    invalid_bytes += byte_sequence[right_curr_index:right_curr_index + right_e.end]
                    right_curr_index += right_e.end
                    index_offset = TIPS_CONTEXT_RANGE
                # Once the invalid bytes reach the maximum size, give up on
                # decoding the right-hand context.
                # NOTE(review): nesting of this check relative to the `if`
                # above should be confirmed against the repository.
                if len(invalid_bytes) >= MAX_INVALID_BYTES_SIZE:
                    right_chars = ''
                    break
        print(f"Error message: {e}")
        # NOTE(review): invalid_bytes already begins at e.start, so
        # e.end + len(invalid_bytes) is not the end index of the invalid run;
        # confirm what this comparison is meant to detect.
        if right_chars and e.end + len(invalid_bytes) != len(byte_sequence):
            # Format the invalid bytes as quoted, space-separated 2-digit hex.
            invalid_str = f"'{' '.join([hex(b)[2:].zfill(2) for b in invalid_bytes])}'"
            print(
                f"There are invalid bytes in the string: {left_chars + invalid_str + right_chars}")
        else:  # Too many invalid bytes: suggest switching to another codec.
            print(f"There are too many invalid bytes, please change codec.")


def test():
    """Print the literal text ``test`` to standard output."""
    message = "test"
    print(message)

0 comments on commit d2512b2

Please sign in to comment.