Skip to content

Commit

Permalink
Remove icu
Browse files Browse the repository at this point in the history
  • Loading branch information
alanshi committed Jan 11, 2024
1 parent f87b473 commit b5143bb
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 47 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ optional arguments:
-p PROCESS_NUM, --process_num PROCESS_NUM
指定进程数,默认为4
-c, --cchardet 使用cchardet方案,
-m, --mode mode=1 mnbvc方案 , mode=2 cchardet方案(默认), mode3=3 pyicu
-m, --mode mode=1 mnbvc方案 , mode=2 cchardet方案(默认)
-i inputDirectory inputDirectory为需要检测的目录
-step PROCESS_STEP 执行步骤,1为编码检测,2为编码转换
-r result_file_name 指定编码检测结果文件名
Expand Down
23 changes: 10 additions & 13 deletions charset_mnbvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from .constant import (CCHARDECT_ENCODING_MAP, ENCODINGS, EXT_ENCODING,
REGEX_FEATURE_ALL, TIPS_CONTEXT_RANGE, MAX_ENCODING_SIZE, MAX_INVALID_BYTES_SIZE)

import icu

# compile makes it more efficient
re_char_check = compile(REGEX_FEATURE_ALL)
Expand Down Expand Up @@ -69,7 +68,7 @@ def fix_data(s: str) -> list:
def from_data(data, mode) -> str:
"""
:param data: data
:param mode: 1:cchardet 2:mnbvc, 3:icu
:param mode: 1:mnbvc 2:cchardet
:return: encoding
"""
coding_name = get_cn_charset(
Expand Down Expand Up @@ -135,16 +134,16 @@ def scan_dir(folder_path, ext='.txt'):
return sub_folders, files


def check_by_icu(data):
"""
:param data:data
:return: encoding
"""
encoding = icu.CharsetDetector(data).detect().getName()
# def check_by_icu(data):
# """
# :param data:data
# :return: encoding
# """
# encoding = icu.CharsetDetector(data).detect().getName()

converted_encoding = CCHARDECT_ENCODING_MAP.get(encoding)
# converted_encoding = CCHARDECT_ENCODING_MAP.get(encoding)

return converted_encoding
# return converted_encoding


def check_by_cchardect(data):
Expand Down Expand Up @@ -218,7 +217,7 @@ def check_disorder_chars(file_path, threshold=0.1):
def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=None):
"""
:param source_data: file path
:param mode: 1: mnbvc, 2: cchardet, 3: icu
:param mode: 1: mnbvc, 2: cchardet
:param source_type: file or data
:return: encoding
"""
Expand Down Expand Up @@ -259,8 +258,6 @@ def get_cn_charset(source_data, source_type="file", mode=1, special_encodings=No
data=data, special_encodings=special_encodings)
elif mode == 2:
encoding = check_by_cchardect(data=data)
elif mode == 3:
encoding = check_by_icu(data=data)
else:
sys.stderr.write(f'Error: mode {mode} is not supported.')

Expand Down
2 changes: 1 addition & 1 deletion charset_mnbvc/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""version"""

__version__ = '0.0.12'
__version__ = '0.0.14'
VERSION = __version__.split('.')
62 changes: 31 additions & 31 deletions convert_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from tqdm import tqdm

from charset_mnbvc import api, verify
from icu import UnicodeString
# from icu import UnicodeString

BLOCK_SIZE = 1024 * 1024

Expand Down Expand Up @@ -82,15 +82,15 @@ def revert_files(file_path):

return True

def convert_file_to_utf8_use_icu(input_file, output_file, encoding):
# 打开二进制文件进行读取
with open(input_file, "rb") as f_input:
with open(output_file, "w") as f_output:
data = f_input.read()
# 将读取的数据转换为UTF-8编码
utf8_data = UnicodeString(data, encoding.upper())
# 将转换后的UTF-8数据写入输出文件
f_output.write(str(utf8_data))
# def convert_file_to_utf8_use_icu(input_file, output_file, encoding):
# # 打开二进制文件进行读取
# with open(input_file, "rb") as f_input:
# with open(output_file, "w") as f_output:
# data = f_input.read()
# # 将读取的数据转换为UTF-8编码
# utf8_data = UnicodeString(data, encoding.upper())
# # 将转换后的UTF-8数据写入输出文件
# f_output.write(str(utf8_data))

def convert_file_to_utf8(file):
"""
Expand All @@ -113,28 +113,28 @@ def convert_file_to_utf8(file):
f_out.write(f_in.read())

except Exception as e:
# 检测encoding是否为gbk或者gb18030,调用pyicu进行转换
msg = f"{file_path} {encoding} 转换到utf8失败, {e}"
if encoding.lower() in ["gbk", "gb18030"]:
try:
# encoding = "GBK"
# with open(raw_file_path, "rb") as f:
# data = f.read()
# encoding = api.from_data(data=data, mode=3)

convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
msg = msg + f" 重新转换 {encoding},使用 icu 成功"
return True, msg
except Exception as e:
msg = msg + f" 重新转换 {encoding},使用 icu 失败"
os.remove(file_path)
return False, msg
else:
os.remove(file_path)
return False, msg

# os.remove(file_path)
# return False, msg
os.remove(file_path)
return False, msg

# 检测encoding是否为gbk或者gb18030,调用pyicu进行转换
# if encoding.lower() in ["gbk", "gb18030"]:
# try:
# # encoding = "GBK"
# # with open(raw_file_path, "rb") as f:
# # data = f.read()
# # encoding = api.from_data(data=data, mode=3)

# convert_file_to_utf8_use_icu(raw_file_path, file_path, encoding)
# msg = msg + f" 重新转换 {encoding},使用 icu 成功"
# return True, msg
# except Exception as e:
# msg = msg + f" 重新转换 {encoding},使用 icu 失败"
# os.remove(file_path)
# return False, msg
# else:
# os.remove(file_path)
# return False, msg
return True, None


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"Source": "https://github.com/alanshi/charset_mnbvc/",
},
install_requires=[
'cchardet',
'faust-cchardet',
'tqdm'
],
)

0 comments on commit b5143bb

Please sign in to comment.