-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheval_script.py
92 lines (70 loc) · 4.04 KB
/
eval_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import json
import random
from argparse import ArgumentParser
from tabulate import tabulate
from utils.eval_utils import evaluate_answer, evaluate_response
def read_jsonl_to_dict(data_path, output_path, category):
    """Load one category's reference records and the model outputs that match them.

    Both files are read as UTF-8 JSON Lines (one JSON object per line).
    Whitespace-only lines are skipped so that a trailing newline or an
    accidental empty line does not abort the whole evaluation.

    Args:
        data_path: Path to the reference JSONL file. Each record must have an
            'id' key; records are kept only when their 'category' value
            equals ``category``.
        output_path: Path to the model-output JSONL file. Each record must
            have an 'id' key; records are kept only when their id was kept
            from the reference file.
        category: Value compared against each reference record's 'category'.

    Returns:
        A ``(data, output)`` tuple of dicts, both keyed by ``int(id)`` and
        holding the parsed JSON records. If an id occurs more than once in a
        file, the last occurrence wins.
    """
    data = {}
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(line)
            if record.get('category') == category:
                data[int(record['id'])] = record

    output = {}
    with open(output_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            record = json.loads(line)
            record_id = int(record.get('id'))
            # Only keep outputs that belong to the selected category.
            if record_id in data:
                output[record_id] = record

    return data, output
def process_answer_jsonl_file(data_path, output_path, category):
    """Score a model-output file that already contains extracted answers.

    For every reference entry of ``category``, the matching output record is
    rewritten in place: its own 'answer' value is moved to
    'predicted_answer', and 'answer' is overwritten with the reference
    answer. The resulting records are then passed to ``evaluate_answer``.

    Args:
        data_path: Path to the reference JSONL file.
        output_path: Path to the model-output JSONL file.
        category: Category whose entries should be scored.

    Returns:
        Whatever ``evaluate_answer`` returns. The script section of this file
        reads the keys 'correct_num', 'entries_num' and 'acc' from it.

    Raises:
        AssertionError: If the ids of the selected reference entries and of
            the model outputs are not identical.
    """
    data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category)

    # Explicit check rather than `assert`, so it still runs under `python -O`
    # and the loop below cannot fail with a bare KeyError.
    if data_dict.keys() != output_dict.keys():
        mismatched = sorted(data_dict.keys() ^ output_dict.keys())
        raise AssertionError(
            "The ids are not exactly the same and cannot be processed further, please check files"
            f" ({len(mismatched)} mismatched ids, first: {mismatched[:10]})"
        )

    for data_key, data_value in data_dict.items():
        entry = output_dict[data_key]
        # Keep the model's answer before replacing 'answer' with the reference.
        entry['predicted_answer'] = entry.get('answer')
        entry['answer'] = data_value.get('answer')

    return evaluate_answer(output_dict.values())
def process_response_jsonl_file(data_path, output_path, category):
    """Score a model-output file that contains raw model responses.

    For every reference entry of ``category``, the matching output record is
    extended in place with the reference 'answer'. Entries whose 'type' is
    "选择" additionally get an 'index2ans' mapping from the option letters
    'A'-'D' to the text of 'option1'-'option4' (missing options become '').
    The resulting records are then passed to ``evaluate_response``.

    Args:
        data_path: Path to the reference JSONL file.
        output_path: Path to the model-output JSONL file.
        category: Category whose entries should be scored.

    Returns:
        Whatever ``evaluate_response`` returns. The script section of this
        file reads the keys 'correct_num', 'entries_num' and 'acc' from it.

    Raises:
        AssertionError: If the ids of the selected reference entries and of
            the model outputs are not identical.
    """
    data_dict, output_dict = read_jsonl_to_dict(data_path, output_path, category)

    # Explicit check rather than `assert`, so it still runs under `python -O`
    # and the loop below cannot fail with a bare KeyError.
    if data_dict.keys() != output_dict.keys():
        mismatched = sorted(data_dict.keys() ^ output_dict.keys())
        raise AssertionError(
            "The ids are not exactly the same and cannot be processed further, please check files"
            f" ({len(mismatched)} mismatched ids, first: {mismatched[:10]})"
        )

    for data_key, data_value in data_dict.items():
        entry = output_dict[data_key]
        if data_value.get('type') == "选择":
            # Only entries of this type get the letter -> option text mapping.
            entry['index2ans'] = {
                'A': data_value.get('option1', ''),
                'B': data_value.get('option2', ''),
                'C': data_value.get('option3', ''),
                'D': data_value.get('option4', ''),
            }
        # NOTE(review): the reference answer is attached to every entry, not
        # only to "选择" ones — confirm against the upstream file's indentation.
        entry['answer'] = data_value.get('answer')

    return evaluate_response(output_dict.values())
if __name__ == '__main__':
    # Command-line entry point: score a model-output file per category and
    # print a table with per-category and overall accuracy.
    parser = ArgumentParser()
    parser.add_argument('--output_path', type=str, default="eval/example/Yi-VL-34B-answer.jsonl", help="The path to model output file.")
    parser.add_argument('--data_path', type=str, default="eval/example/cmmmu-data-val-answer.jsonl", help="Answer file path.")
    args = parser.parse_args()

    # Category names as they appear in the data file, with the English labels
    # used in the printed table.
    category_list = ['艺术与设计', '商业', '科学', '健康与医学', '人文社会科学', '技术与工程']
    category_dict = {'艺术与设计': 'Art & Design', '商业': 'Business', '科学': 'Science', '健康与医学': 'Health & Medicine', '人文社会科学': 'Humanities & Social Sciences', '技术与工程': 'Technology & Engineering'}

    headers = ['Subject', 'Correct Num', 'Entries Num', 'Acc']
    table = []
    correct_sum = 0
    entries_sum = 0

    # Sniff the output format: every record must have exactly the keys
    # {'id', 'type', 'answer'} or exactly {'id', 'type', 'response'}.
    is_answer = True
    is_response = True
    # Read as UTF-8 explicitly, matching how the files are read elsewhere in
    # this script, instead of relying on the platform default encoding.
    with open(args.output_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Ignore blank lines such as a trailing newline.
                continue
            keys = set(json.loads(line).keys())
            if keys != {'id', 'type', 'answer'}:
                is_answer = False
            if keys != {'id', 'type', 'response'}:
                is_response = False

    # Explicit check rather than `assert`, so it still runs under `python -O`.
    if not (is_answer or is_response):
        raise AssertionError("The file should contain either 'answer' or 'response'")

    for category in category_list:
        if is_answer:
            results_count = process_answer_jsonl_file(args.data_path, args.output_path, category)
        else:
            results_count = process_response_jsonl_file(args.data_path, args.output_path, category)

        correct_sum += results_count['correct_num']
        entries_sum += results_count['entries_num']
        table.append([category_dict[category], results_count['correct_num'], results_count['entries_num'], results_count['acc']])

    # Avoid ZeroDivisionError when no entries were evaluated at all.
    overall_acc = correct_sum / entries_sum if entries_sum else 0.0
    table.append(['Overall', correct_sum, entries_sum, overall_acc])

    print(tabulate(table, headers=headers, tablefmt='orgtbl'))