优化中英文检测(剔除有简体字对应的繁体范围)

alanshi · Oct 17, 2024 · b2385ac · b2385ac
1 parent 67aa221
commit b2385ac
Show file tree

Hide file tree

Showing 8 changed files with 29 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -37,10 +37,10 @@ print(f"文件名: {file_path}, 编码: {coding_name}")
 ```
 from charset_mnbvc import api
 
-file_path = "test.txt"
-ret, percentage = api.check_zh_en(file_path)
-print(f"中英文文档: {ret}, 比例: {percentage}")
-
+with open("tests/fixtures/10.txt", "rb") as f:
+    data = f.read()
+    ret, percentage = api.check_zh_en(data)
+    print(f"是否为中英文文档: {ret}, 比例: {percentage}")
 ```
 
 ##### 获取二进制数据编码

diff --git a/charset_mnbvc/api.py b/charset_mnbvc/api.py
@@ -17,11 +17,14 @@
     MAX_INVALID_BYTES_SIZE,
     BINARY_EXTENSIONS,
     REGEX_ZH_EN,
+    EXCLUDED_UNICODE_CHARS
+
 )
 
 # compile makes it more efficient
 re_char_check = compile(REGEX_FEATURE_ALL)
 re_zh_en = compile(REGEX_ZH_EN)
+re_zh_en_exclude = compile(EXCLUDED_UNICODE_CHARS)
 
 def check_zh_en(data) -> bool:
     """
@@ -31,6 +34,7 @@ def check_zh_en(data) -> bool:
     Returns:
     bool: True if the data contains Chinese and English characters, False otherwise.
     """
+
     encoding = check_by_cchardect(data)
     if not encoding:
         return False, 0
@@ -41,8 +45,15 @@ def check_zh_en(data) -> bool:
     # count the number of Chinese and English characters
     TIPS_CONTEXT_RANGE = 96
     for char in data:
+
+        #要排除的字符集(包含对应的简体中文的繁体字)
         if re_zh_en.match(char):
             zh_en_count += 1
+        if re_zh_en_exclude.match(char):
+            zh_en_count -= 1
+
+
+
 
     percentage = (zh_en_count / total_bytes) * 100
     ret = True if percentage > TIPS_CONTEXT_RANGE else False

diff --git a/charset_mnbvc/constant.py b/charset_mnbvc/constant.py
diff --git a/examples/check_zh_en.py b/examples/check_zh_en.py
@@ -24,7 +24,8 @@ def check_zh_en_by_file(file_path):
 
 if __name__ == "__main__":
     # load directory
-    folder_path = "tests/fixtures/"
-    file_path = "tests/fixtures/18.txt"
+
+    folder_path = "/Users/alan/Downloads/text.output"
+    # file_path = "tests/fixtures/18.txt"
     check_zh_en_by_folder(folder_path)
-    check_zh_en_by_file(file_path)
+    # check_zh_en_by_file(file_path)
diff --git a/scripts/fanti_collection.py b/scripts/fanti_collection.py
diff --git a/tests/fixtures/11.txt b/tests/fixtures/11.txt
@@ -0,0 +1 @@
+'䯀', '鮣', '龑', '鵟', '霑', '諡', '椹', '鵰', '鱲', '輋', '𨊰', '𨊸', '綖', '篸', '𫟰', '頫', '睍', '隤', '塿', '娙', '巠', '鄩', '嵽', '巘', '顗', '廞', '彄', '鷟', '鱀', '晛', '暐', '櫍', '鮆', '璗', '熰', '燀', '燖', '鸑', '鶠', '頠', '埨', '塸', '墠', '頵', '鵏', '頔', '鷭', '篢', '勣', '軏', '輗', '輮', '軝', '輶', '醲', '闉', '闑', '饘', '餗', '駓', '駼', '駪', '駉', '騊', '騵', '駃', '騱', '騄', '馼', '騠', '騞', '驎', '鮡', '鯻', '鮈', '鮠', '鱚', '鮀', '鰤', '鰊', '鰶', '鶱', '齘', '齯', '齮', '齼', '澫', '漍', '浿', '瓅', '僤', '璕', '蝀', '襀', '訏', '謏', '詷', '諴', '諲', '譓', '諓', '諟', '譞', '詪', '詝', '
diff --git a/tests/fixtures/test_ft.txt b/tests/fixtures/test_ft.txt
@@ -0,0 +1 @@
+繁體字，與簡體字相對，是結構相對複雜的漢字書寫字體，一般筆畫較多。在漢字簡化的過程中，一些漢字會簡化成簡單好寫的字體，稱為「簡化字」，而繁體字一詞就在這個過程中用來稱呼與其對應的未被簡化的漢字。而傳承字是指未被簡化的漢字。
diff --git a/tests/fixtures/test_jp.txt b/tests/fixtures/test_jp.txt
@@ -0,0 +1 @@
+にほんご
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		'䯀', '鮣', '龑', '鵟', '霑', '諡', '椹', '鵰', '鱲', '輋', '𨊰', '𨊸', '綖', '篸', '𫟰', '頫', '睍', '隤', '塿', '娙', '巠', '鄩', '嵽', '巘', '顗', '廞', '彄', '鷟', '鱀', '晛', '暐', '櫍', '鮆', '璗', '熰', '燀', '燖', '鸑', '鶠', '頠', '埨', '塸', '墠', '頵', '鵏', '頔', '鷭', '篢', '勣', '軏', '輗', '輮', '軝', '輶', '醲', '闉', '闑', '饘', '餗', '駓', '駼', '駪', '駉', '騊', '騵', '駃', '騱', '騄', '馼', '騠', '騞', '驎', '鮡', '鯻', '鮈', '鮠', '鱚', '鮀', '鰤', '鰊', '鰶', '鶱', '齘', '齯', '齮', '齼', '澫', '漍', '浿', '瓅', '僤', '璕', '蝀', '襀', '訏', '謏', '詷', '諴', '諲', '譓', '諓', '諟', '譞', '詪', '詝', '
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		繁體字，與簡體字相對，是結構相對複雜的漢字書寫字體，一般筆畫較多。在漢字簡化的過程中，一些漢字會簡化成簡單好寫的字體，稱為「簡化字」，而繁體字一詞就在這個過程中用來稱呼與其對應的未被簡化的漢字。而傳承字是指未被簡化的漢字。