Fix missing code

alanshi · Dec 14, 2023 · d2512b2 · d2512b2
1 parent 7746006
commit d2512b2
Showing 1 changed file with 82 additions and 2 deletions.
diff --git a/charset_mnbvc/api.py b/charset_mnbvc/api.py
@@ -8,14 +8,32 @@
 
 from .common_utils import print_table
 from .constant import (CCHARDECT_ENCODING_MAP, ENCODINGS, EXT_ENCODING,
-                       REGEX_FEATURE_ALL)
+                       REGEX_FEATURE_ALL, TIPS_CONTEXT_RANGE, MAX_ENCODING_SIZE, MAX_INVALID_BYTES_SIZE)
 
 import icu
 
 # compile makes it more efficient
 re_char_check = compile(REGEX_FEATURE_ALL)
 
 
+def is_perceivable(s):
+    """
+    Looks for the first character in a string that the user cannot perceive.
+    Perceivable characters are those for which ``str.isprintable()`` is true,
+    plus space, tab and newline.
+
+    Args:
+    s (str): The string to check.
+
+    Returns:
+    bool | str: ``True`` if every character is perceivable (an empty string
+    also yields ``True``); otherwise the ``unicode_escape`` text of the first
+    offending character, e.g. ``\\x00`` for a NUL character.
+
+    Note:
+    Both outcomes are truthy, so compare the result with ``is True`` instead
+    of relying on its truth value.
+    """
+    for char in s:
+        # isprintable() is False for tab and newline, so they are allowed explicitly.
+        if char.isprintable() or char in ' \t\n':
+            continue
+        # Hand back the offender in escaped form so it stays visible when printed.
+        return char.encode('unicode_escape').decode()
+    return True
+
+
 def has_control_characters(text):
     """
     :param text: text
@@ -116,6 +134,7 @@ def scan_dir(folder_path, ext='.txt'):
         files.extend(f)
     return sub_folders, files
 
+
 def check_by_icu(data):
     """
     :param data:data
@@ -127,6 +146,7 @@ def check_by_icu(data):
 
     return converted_encoding
 
+
 def check_by_cchardect(data):
     """
     :param data: data
@@ -227,11 +247,16 @@ def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=No
             # if has_control_characters(data.decode("unicode_escape")):
             #     return "UNKNOWN"
 
+            # return_is_perceivable = is_perceivable(data.decode("unicode_escape"))
+            # if return_is_perceivable:
+            #     return "UNKNOWN: %s" % return_is_perceivable
+
         except Exception as err:
             pass
 
         if mode == 1:
-            encoding = check_by_mnbvc(data=data, special_encodings=special_encodings)
+            encoding = check_by_mnbvc(
+                data=data, special_encodings=special_encodings)
         elif mode == 2:
             encoding = check_by_cchardect(data=data)
         elif mode == 3:
@@ -273,5 +298,60 @@ def convert_encoding(source_data, source_encoding, target_encoding="utf-8"):
     return data
 
 
+def find_invalid_bytes(byte_sequence: bytes, decoding="gbk"):
+    """
+    Report where ``byte_sequence`` fails to decode and print the bytes at
+    fault together with some decoded text from either side of them.
+
+    The whole input is decoded with ``decoding`` first. If that succeeds a
+    confirmation is printed. Otherwise the bytes named by the
+    ``UnicodeDecodeError`` (``e.start`` to ``e.end``) are taken as the invalid
+    bytes, text before and after them is decoded through a growing byte
+    window, and the result is printed. The window sizes and limits come from
+    the module constants ``TIPS_CONTEXT_RANGE``, ``MAX_ENCODING_SIZE`` and
+    ``MAX_INVALID_BYTES_SIZE`` (their values are defined outside this function).
+
+    :param byte_sequence: input bytes
+    :param decoding: name of the codec used for decoding (default "gbk")
+    :return: None; every result is written with print()
+    """
+    try:
+        byte_sequence.decode(decoding)
+        print("No decoding errors found, the byte sequence is valid.")
+    except UnicodeDecodeError as e:
+        # Decode the valid characters to the left of the failure
+        invalid_bytes = byte_sequence[e.start:e.end]
+        left_chars = ''
+        index_offset = TIPS_CONTEXT_RANGE
+        # Grow the window backwards one byte per pass until it decodes to at
+        # least TIPS_CONTEXT_RANGE characters.
+        while len(left_chars) < TIPS_CONTEXT_RANGE:
+            index_offset += 1
+            if e.start - index_offset < 0:
+                # The window would start before the input: use everything in
+                # front of the failure and stop.
+                left_chars = byte_sequence[:e.start].decode(decoding)
+                break
+            try:
+                left_chars = byte_sequence[e.start -
+                                           index_offset:e.start].decode(decoding)
+            except UnicodeDecodeError as _:
+                # Keep the previous result and retry with a wider window
+                # (presumably this window started inside a multi-byte character).
+                pass
+        # Decode the valid characters to the right of the failure
+        right_chars = ''
+        right_curr_index = e.end
+        index_offset = TIPS_CONTEXT_RANGE
+        while len(right_chars) < TIPS_CONTEXT_RANGE:
+            index_offset += 1
+            # NOTE(review): with `>=` a window that ends exactly on the last
+            # byte is never tried, so right_chars can stay empty when little
+            # data follows the failure - confirm this is intended.
+            if right_curr_index + index_offset >= len(byte_sequence):
+                break
+            try:
+                right_chars = byte_sequence[right_curr_index: right_curr_index +
+                                            index_offset].decode(decoding)
+            except UnicodeDecodeError as right_e:
+                # Once the window exceeds the maximum byte count for the context
+                # hint, move the boundary of the invalid bytes
+                if index_offset >= MAX_ENCODING_SIZE * TIPS_CONTEXT_RANGE:
+                    # right_e.end is relative to the slice that was decoded, so
+                    # it is applied as an offset from right_curr_index.
+                    invalid_bytes += byte_sequence[right_curr_index:right_curr_index + right_e.end]
+                    right_curr_index += right_e.end
+                    index_offset = TIPS_CONTEXT_RANGE
+                    # Once the invalid bytes exceed the maximum allowed, give up
+                    # decoding the right-hand side
+                    if len(invalid_bytes) >= MAX_INVALID_BYTES_SIZE:
+                        right_chars = ''
+                        break
+        print(f"Error message: {e}")
+        # NOTE(review): invalid_bytes begins at e.start, yet its length is added
+        # to e.end here - verify which end position this comparison is meant to test.
+        if right_chars and e.end + len(invalid_bytes) != len(byte_sequence):
+            # Format the invalid bytes for output: two-digit hex values
+            # separated by spaces, wrapped in single quotes
+            invalid_str = f"'{' '.join([hex(b)[2:].zfill(2) for b in invalid_bytes])}'"
+            print(
+                f"There are invalid bytes in the string: {left_chars + invalid_str + right_chars}")
+        else:  # Too many invalid bytes: suggest switching to another codec
+            # NOTE(review): this branch also runs whenever right_chars is empty,
+            # e.g. after the end-of-input break above.
+            print(f"There are too many invalid bytes, please change codec.")
+
+
 def test():
     print("test")