Skip to content

Commit

Permalink
improve pre_check.py
Browse files Browse the repository at this point in the history
  • Loading branch information
alanshi committed Jan 31, 2024
1 parent 1ccb033 commit 594b881
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 20 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,16 @@ optional arguments:
###################################### Step3 end ######################################
```

#### pre_check.py使用范例
```
usage: convert_files.py [-h] [-p PROCESS_NUM] [-m MODE] -i inputDirectory [-r check_result_file_name]
convert_files.py: error: the following arguments are required: -i
python pre_check.py -i /Users/alan/temp_test/20230101/aliyun.20230101.8.武侠小说
检测进度1: 100%|█████████████████████████████████████████████████████████████████| 3724/3724 [00:01<00:00, 3153.64it/s]
检测进度2: 100%|█████████████████████████████████████████████████████████████████| 3724/3724 [00:00<00:00, 4288.96it/s]
已将检测出错结果保存至 check_result_1706672423.csv 文件中,请查阅!
```

#### wiki地址:
https://wiki.mnbvc.org/doku.php/ylzq
Expand Down
57 changes: 37 additions & 20 deletions pre_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import csv
import time
import argparse


from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

Expand All @@ -8,6 +11,7 @@
from charset_mnbvc import api, verify
from charset_mnbvc.common_utils import get_file_paths


def parse_args():
"""
解析命令行参数
Expand Down Expand Up @@ -37,23 +41,27 @@ def parse_args():
dest='folder_path',
help='inputDirectory为需要检测的目录'
)
parser.add_argument(
'-r',
required=False,
default=f'check_result_{int(time.time())}.csv',
metavar='check_result_file_name',
dest='check_result_file_name',
help='指定编码检测结果文件名'
)
return parser.parse_args()


def encoding_check(inputs):
"""
编码检测
"""
# file_count, files = api.from_dir(
# folder_path=inputs.folder_path,
# mode=inputs.mode
# )
files = []
results = []
files = get_file_paths(inputs.folder_path, suffix='.txt')
with ProcessPoolExecutor(inputs.process_num) as executor:
futures = []
with tqdm(desc="检测进度", total=len(files)) as pbar:
with tqdm(desc="检测进度1", total=len(files)) as pbar:
# 提交任务,并将Future对象添加到futures列表中
for file in files:
future = executor.submit(api.from_file, file, inputs.mode)
Expand All @@ -66,25 +74,35 @@ def encoding_check(inputs):

return results


def convert_check(file_path, encoding):
    """
    Check that the content of a file can be decoded with the given encoding.

    :param file_path: path of the file to verify
    :param encoding: encoding name handed to ``api.decode``
    :return: dict with the keys ``file_path``, ``status`` (True when the
             check passed) and ``msg`` ("success" or the failure reason)
    """
    result = {
        "file_path": file_path,
        "status": True,
        "msg": "success"
    }
    try:
        # Read inside the try block so that an unreadable or missing file is
        # reported as a failed row instead of raising out of the worker.
        with open(file_path, "rb") as f:
            data = f.read()
        if not data:
            # Nothing to decode: return right away so the decoder is never
            # called on empty input and cannot overwrite this message.
            result['status'] = False
            result['msg'] = "文件为空"
            return result
        ret = api.decode(data, encoding)
        if not ret:
            result['status'] = False
            result['msg'] = "文件为空"
    except Exception as e:
        # Any read or decode failure is recorded for this file only.
        result['status'] = False
        result['msg'] = str(e)

    return result

def process(files, inputs):
results = []
with ProcessPoolExecutor(inputs.process_num) as executor:
futures = []
with tqdm(desc="检测进度", total=len(files)) as pbar:
with tqdm(desc="检测进度2", total=len(files)) as pbar:
for file in files:
file_path = file[0]
encoding = file[1]
Expand All @@ -101,16 +119,15 @@ def process(files, inputs):

def main():
    """
    Run the encoding pre-check and save every failed file to a CSV report.

    Parses the command line, detects the encoding of the input files,
    verifies each file against its detected encoding, then writes one
    ``file_path, msg`` row per failed check to the file named by
    ``check_result_file_name``.
    """
    inputs = parse_args()
    encoding_results = encoding_check(inputs)
    process_results = process(encoding_results, inputs)

    # The rows carry non-ASCII paths and messages, so the encoding is fixed
    # to UTF-8 rather than left to the platform's locale default.
    with open(inputs.check_result_file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(
            [row['file_path'], row['msg']]
            for row in process_results
            if not row['status']
        )
    print(f"已将检测出错结果保存至 {inputs.check_result_file_name} 文件中,请查阅!")


# Run the check exactly once, and only when executed as a script.
if __name__ == "__main__":
    main()

0 comments on commit 594b881

Please sign in to comment.