Skip to content

Commit

Permalink
优化中英文检测(剔除有简体字对应的繁体范围)
Browse files Browse the repository at this point in the history
  • Loading branch information
alanshi committed Oct 17, 2024
1 parent 67aa221 commit b2385ac
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 8 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ print(f"文件名: {file_path}, 编码: {coding_name}")
```
from charset_mnbvc import api
file_path = "test.txt"
ret, percentage = api.check_zh_en(file_path)
print(f"中英文文档: {ret}, 比例: {percentage}")
with open("tests/fixtures/10.txt", "rb") as f:
data = f.read()
ret, percentage = api.check_zh_en(file_path)
print(f"是否为中英文文档: {ret}, 比例: {percentage}")
```

##### 获取二进制数据编码
Expand Down
11 changes: 11 additions & 0 deletions charset_mnbvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@
MAX_INVALID_BYTES_SIZE,
BINARY_EXTENSIONS,
REGEX_ZH_EN,
EXCLUDED_UNICODE_CHARS

)

# compile makes it more efficient
re_char_check = compile(REGEX_FEATURE_ALL)
re_zh_en = compile(REGEX_ZH_EN)
re_zh_en_exclude = compile(EXCLUDED_UNICODE_CHARS)

def check_zh_en(data) -> bool:
"""
Expand All @@ -31,6 +34,7 @@ def check_zh_en(data) -> bool:
Returns:
bool: True if the data contains Chinese and English characters, False otherwise.
"""

encoding = check_by_cchardect(data)
if not encoding:
return False, 0
Expand All @@ -41,8 +45,15 @@ def check_zh_en(data) -> bool:
# count the number of Chinese and English characters
TIPS_CONTEXT_RANGE = 96
for char in data:

#要排除的字符集(包含对应的简体中文的繁体字)
if re_zh_en.match(char):
zh_en_count += 1
if re_zh_en_exclude.match(char):
zh_en_count -= 1




percentage = (zh_en_count / total_bytes) * 100
ret = True if percentage > TIPS_CONTEXT_RANGE else False
Expand Down
4 changes: 3 additions & 1 deletion charset_mnbvc/constant.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions examples/check_zh_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def check_zh_en_by_file(file_path):

if __name__ == "__main__":
# load directory
folder_path = "tests/fixtures/"
file_path = "tests/fixtures/18.txt"

folder_path = "/Users/alan/Downloads/text.output"
# file_path = "tests/fixtures/18.txt"
check_zh_en_by_folder(folder_path)
check_zh_en_by_file(file_path)
# check_zh_en_by_file(file_path)
4 changes: 4 additions & 0 deletions scripts/fanti_collection.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/fixtures/11.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'䯀', '鮣', '龑', '鵟', '霑', '諡', '椹', '鵰', '鱲', '輋', '𨊰', '𨊸', '綖', '篸', '𫟰', '頫', '睍', '隤', '塿', '娙', '巠', '鄩', '嵽', '巘', '顗', '廞', '彄', '鷟', '鱀', '晛', '暐', '櫍', '鮆', '璗', '熰', '燀', '燖', '鸑', '鶠', '頠', '埨', '塸', '墠', '頵', '鵏', '頔', '鷭', '篢', '勣', '軏', '輗', '輮', '軝', '輶', '醲', '闉', '闑', '饘', '餗', '駓', '駼', '駪', '駉', '騊', '騵', '駃', '騱', '騄', '馼', '騠', '騞', '驎', '鮡', '鯻', '鮈', '鮠', '鱚', '鮀', '鰤', '鰊', '鰶', '鶱', '齘', '齯', '齮', '齼', '澫', '漍', '浿', '瓅', '僤', '璕', '蝀', '襀', '訏', '謏', '詷', '諴', '諲', '譓', '諓', '諟', '譞', '詪', '詝', '
1 change: 1 addition & 0 deletions tests/fixtures/test_ft.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
繁體字,與簡體字相對,是結構相對複雜的漢字書寫字體,一般筆畫較多。在漢字簡化的過程中,一些漢字會簡化成簡單好寫的字體,稱為「簡化字」,而繁體字一詞就在這個過程中用來稱呼與其對應的未被簡化的漢字。而傳承字是指未被簡化的漢字。
1 change: 1 addition & 0 deletions tests/fixtures/test_jp.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
にほんご

0 comments on commit b2385ac

Please sign in to comment.