From 594b881338ed7b9e2f3caf9a07efbdebfe483ca2 Mon Sep 17 00:00:00 2001 From: alan Date: Wed, 31 Jan 2024 11:45:10 +0800 Subject: [PATCH] improve pre_check.py --- README.md | 10 +++++++++ pre_check.py | 57 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index ff03d47..15c7c27 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,16 @@ optional arguments: ###################################### Step3 end ###################################### ``` +#### pre_check.py使用范例 +``` +usage: convert_files.py [-h] [-p PROCESS_NUM] [-m MODE] -i inputDirectory [-r check_result_file_name] +convert_files.py: error: the following arguments are required: -i + +python pre_check.py -i /Users/alan/temp_test/20230101/aliyun.20230101.8.武侠小说 +检测进度1: 100%|█████████████████████████████████████████████████████████████████| 3724/3724 [00:01<00:00, 3153.64its] +检测进度2: 100%|█████████████████████████████████████████████████████████████████| 3724/3724 [00:00<00:00, 4288.96its] +已将检测出错结果保存至 check_result_1706672423.csv 文件中,请查阅! 
+``` #### wiki地址: https://wiki.mnbvc.org/doku.php/ylzq diff --git a/pre_check.py b/pre_check.py index b0816f4..6cf01fd 100644 --- a/pre_check.py +++ b/pre_check.py @@ -1,5 +1,8 @@ +import csv +import time import argparse + from concurrent.futures import ProcessPoolExecutor from pathlib import Path @@ -8,6 +11,7 @@ from charset_mnbvc import api, verify from charset_mnbvc.common_utils import get_file_paths + def parse_args(): """ 解析命令行参数 @@ -37,6 +41,14 @@ def parse_args(): dest='folder_path', help='inputDirectory为需要检测的目录' ) + parser.add_argument( + '-r', + required=False, + default=f'check_result_{int(time.time())}.csv', + metavar='check_result_file_name', + dest='check_result_file_name', + help='指定编码检测结果文件名' + ) return parser.parse_args() @@ -44,16 +56,12 @@ def encoding_check(inputs): """ 编码检测 """ - # file_count, files = api.from_dir( - # folder_path=inputs.folder_path, - # mode=inputs.mode - # ) files = [] results = [] files = get_file_paths(inputs.folder_path, suffix='.txt') with ProcessPoolExecutor(inputs.process_num) as executor: futures = [] - with tqdm(desc="检测进度", total=len(files)) as pbar: + with tqdm(desc="检测进度1", total=len(files)) as pbar: # 提交任务,并将Future对象添加到futures列表中 for file in files: future = executor.submit(api.from_file, file, inputs.mode) @@ -66,25 +74,35 @@ def encoding_check(inputs): return results + def convert_check(file_path, encoding): + result = { + "file_path": file_path, + "status": True, + "msg": "success" + } with open(file_path, "rb") as f: data = f.read() if not data: - return False, f"{file_path}, 文件为空" + result['status'] = False + result['msg'] = "文件为空" try: - ret = api.decode_check(data, encoding) - if not ret[0]: - return False, f"{file_path}, {ret[1]}" + ret = api.decode(data, encoding) + if not ret: + result['status'] = False + result['msg'] = "文件为空" except Exception as e: - return False, f"{file_path}, {str(e)}" + result['status'] = False + result['msg'] = str(e) + + return result - return True, "success" def process(files, 
inputs): results = [] with ProcessPoolExecutor(inputs.process_num) as executor: futures = [] - with tqdm(desc="检测进度", total=len(files)) as pbar: + with tqdm(desc="检测进度2", total=len(files)) as pbar: for file in files: file_path = file[0] encoding = file[1] @@ -101,16 +119,15 @@ def process(files, inputs): def main(): inputs = parse_args() - input_folder = inputs.folder_path - mode = inputs.mode - process_num = inputs.process_num encoding_results = encoding_check(inputs) process_results = process(encoding_results, inputs) - for i in process_results: - if not i[0]: - print(i) - + with open(inputs.check_result_file_name, 'w', newline='') as file: + writer = csv.writer(file) + for row in process_results: + if not row['status']: + writer.writerow([row['file_path'], row['msg']]) + print(f"已将检测出错结果保存至 {inputs.check_result_file_name} 文件中,请查阅!") if __name__ == "__main__": - main() \ No newline at end of file + main()