"""parse_raw_data.py

Parse the trigger-annotated wiki-sentence TSV into event-detection JSON and
run quick sanity checks over the annotations.
"""
import json
from collections import Counter
# Parsed event-detection index, loaded by add_entity().
# NOTE(review): absolute path into one user's home directory -- it will not
# resolve on other machines; confirm before relying on it.
PARSE_JSON_INDEX = "/home/appleternity/workspace/lab/event_detection/data/event_detection.new.parse.json.index"

# JSON file written by parse_file().
EVENT_DETECTION_JSON = "./data/event_detection.json"

# Annotated TSV read by parse_file(), add_entity(), test_same_sent() and
# test_trigger_num().
TRIGGER_TSV = "./data/wiki_sentence_annotated.with_trigger.tsv"
def parse_file(input_path=None, output_path=None):
    """Convert the trigger-annotated TSV into a JSON list of event records.

    Each input line is tab-separated. Column 3 (0-based) holds the sentence
    and every later column holds one annotation with exactly four
    comma-separated fields: ``info``, an ignored field (presumably the
    annotated text -- it is re-derived from the offsets instead), ``start``
    and ``end``, where the offsets slice the sentence.

    For every line one record is emitted with the keys ``sentence``,
    ``trigger_word`` and ``argument_list``. Annotations whose ``info`` is
    ``"negative"`` are dropped; an annotation whose ``info`` ends in
    ``"trigger"`` becomes ``trigger_word`` (the last one on the line wins);
    all others are appended to ``argument_list``. Each kept annotation is
    stored as ``(word, info, start, end)``.

    Args:
        input_path: TSV file to read. ``None`` means ``TRIGGER_TSV``.
        output_path: JSON file to write. ``None`` means
            ``EVENT_DETECTION_JSON``.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
        ValueError: If an annotation does not have exactly four fields or
            its offsets are not integers.
    """
    # Defaults are resolved at call time so a bare ``parse_file()`` keeps
    # using the module-level paths.
    if input_path is None:
        input_path = TRIGGER_TSV
    if output_path is None:
        output_path = EVENT_DETECTION_JSON

    # ", " may occur inside a field, so it is masked before splitting on ","
    # and restored in every field afterwards.
    placeholder = "++++++++++"

    data = []
    with open(input_path, 'r', encoding='utf-8') as infile:
        for index, line in enumerate(infile):
            if index % 1000 == 0:
                print(index)  # coarse progress indicator

            # Strip the line terminator; otherwise a sentence in the final
            # column would be stored with a trailing "\n".
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)

            row = line.split("\t")
            sent = row[3]
            print(sent)

            argument_list = []
            trigger_word = None
            for entity in row[4:]:
                if not entity.strip():
                    continue  # empty cell, e.g. produced by a trailing tab

                masked = entity.replace(", ", placeholder)
                fields = [
                    field.replace(placeholder, ", ")
                    for field in masked.split(",")
                ]
                info, _, start, end = fields
                start, end = int(start), int(end)

                if info == "negative":
                    continue

                annotation = (sent[start:end], info, start, end)
                if info.endswith("trigger"):
                    trigger_word = annotation
                else:
                    argument_list.append(annotation)

            data.append({
                "sentence": sent,
                "trigger_word": trigger_word,
                "argument_list": argument_list,
            })

    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)
def add_entity(event_data_path=None, file_name=None):
    """Walk the non-trigger, non-negative annotations of the trigger TSV.

    NOTE(review): this function looks unfinished. It loads the parsed event
    index and resolves the text span of every remaining annotation, but
    nothing is stored, printed or returned yet.

    Args:
        event_data_path: JSON file to load. ``None`` means
            ``PARSE_JSON_INDEX``.
        file_name: Annotated TSV to read. ``None`` means ``TRIGGER_TSV``.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
        ValueError: If an annotation does not have exactly four fields or
            its offsets are not integers.
    """
    # Defaults are resolved at call time so a bare ``add_entity()`` keeps
    # using the module-level paths.
    if event_data_path is None:
        event_data_path = PARSE_JSON_INDEX
    if file_name is None:
        file_name = TRIGGER_TSV

    with open(event_data_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)  # TODO: not consumed yet

    # ", " may occur inside a field, so it is masked before splitting on ","
    # and restored in every field afterwards.
    placeholder = "++++++++++"

    with open(file_name, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Strip the line terminator so it cannot leak into the sentence.
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)

            row = line.split('\t')
            sent = row[3]
            for entity in row[4:]:
                if not entity.strip():
                    continue  # empty cell, e.g. produced by a trailing tab

                masked = entity.replace(", ", placeholder)
                fields = [
                    field.replace(placeholder, ", ")
                    for field in masked.split(",")
                ]
                info, _, start, end = fields
                start, end = int(start), int(end)

                if info.endswith("trigger") or info == "negative":
                    continue

                word = sent[start:end]  # TODO: attach to ``data``
def test_same_sent(file_name=None):
    """Print how many distinct sentences occur on more than one TSV line.

    The sentence is taken from column 3 (0-based) of each tab-separated line.
    The line terminator is stripped first, so a sentence that sits in the
    final column compares equal to the same sentence followed by annotation
    columns. Blank lines are ignored.

    Args:
        file_name: Annotated TSV to read. ``None`` means ``TRIGGER_TSV``.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
    """
    # Default is resolved at call time so a bare ``test_same_sent()`` keeps
    # using the module-level path.
    if file_name is None:
        file_name = TRIGGER_TSV

    with open(file_name, 'r', encoding='utf-8') as infile:
        stripped_lines = (line.rstrip("\n") for line in infile)
        sentence_counts = Counter(
            line.split("\t")[3]
            for line in stripped_lines
            if line
        )

    same_count = sum(1 for count in sentence_counts.values() if count != 1)
    print("same_count:", same_count)
def test_trigger_num(file_name=None):
    """Print how many TSV lines carry more than one trigger annotation.

    Every column after the sentence column (index 3) is parsed as a
    four-field, comma-separated annotation; an annotation counts as a
    trigger when its first field ends in ``"trigger"``.

    Args:
        file_name: Annotated TSV to read. ``None`` means ``TRIGGER_TSV``.

    Raises:
        ValueError: If an annotation does not have exactly four fields.
    """
    # Default is resolved at call time so a bare ``test_trigger_num()`` keeps
    # using the module-level path.
    if file_name is None:
        file_name = TRIGGER_TSV

    # ", " may occur inside a field, so it is masked before splitting on ","
    # and restored in every field afterwards.
    placeholder = "++++++++++"

    multi_trigger_count = 0
    with open(file_name, 'r', encoding='utf-8') as infile:
        for line in infile:
            row = line.rstrip("\n").split('\t')
            trigger_count = 0
            for entity in row[4:]:
                if not entity.strip():
                    continue  # empty cell, e.g. produced by a trailing tab

                masked = entity.replace(", ", placeholder)
                fields = [
                    field.replace(placeholder, ", ")
                    for field in masked.split(",")
                ]
                # Unpacking keeps the four-field shape check even though
                # only the first field is needed here.
                info, _, _, _ = fields
                if info.endswith("trigger"):
                    trigger_count += 1

            if trigger_count > 1:
                multi_trigger_count += 1

    print("multi_trigger_count:", multi_trigger_count)
if __name__ == "__main__":
    # Script entry point: regenerate the event-detection JSON from the TSV.
    parse_file()
    # Optional dataset sanity checks, left disabled:
    # test_same_sent()
    # test_trigger_num()