-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPostProcess.py
129 lines (115 loc) · 4.43 KB
/
PostProcess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Encoding used for all corpus/result files.
# NOTE(review): 'gbk' suggests the People's Daily (199801) corpus — confirm
# that every input file is actually GBK-encoded before changing this.
ENCODING = 'gbk'
import re
from CBGM import *
from regex_preprocess import *
# from Evaluate import evaluation
from BMM import PortableBMM
"""def continue_singleton(seg):
total_char = 0
num = len(seg)
continue_single = 0
for i in len(seg):
total_char += len(seg[i])
if len(seg[i]) == 1 and len(seg[i + 1] == 1):
continue_single += 1"""
def timestamp(seg):
# print(seg[0])
if len(seg[0]) > 1 and seg[0][0] == '§':
newseg = []
newseg.append('§')
newseg.append(seg[0][1:])
newseg.extend(seg[1:])
return newseg
return seg
def CBGM_poct(trainfile: str, testfile: str, outresultfile: str) -> None:
    """Segment every line of *testfile* and write the result to *outresultfile*.

    Pipeline: train a character-based generative model (CBGM) and a backward
    maximum matching (BMM) segmenter on *trainfile*; segment each input line
    with the CBGM, falling back to BMM when the output looks over-fragmented;
    then apply several post-processing fixes. Output format: one line per
    input line, words joined by '/ '. All files use the module ENCODING.

    Args:
        trainfile: path to the training corpus for both models.
        testfile: path to the text to segment.
        outresultfile: path of the segmented output file (overwritten).
    """
    # Initialise the two segmenters.
    model = CharacterBasedGenerativeModel()
    pb = PortableBMM()
    # Train both models on the same corpus.
    model.train(trainfile)
    pb.train(trainfile)
    with open(testfile, 'r', encoding=ENCODING) as file, \
        open(outresultfile, 'w', encoding=ENCODING) as result:
        # Per-line post-processing of the test file.
        for line in file:
            line = line.strip('\n')
            if line == '':
                # Preserve blank lines in the output.
                result.write('\n')
                continue
            # Regex preprocessing: presumably replaces regex-matched spans
            # with placeholders, with iter_list recording the originals for
            # later restoration — TODO confirm against regex_preprocess.
            line, iter_list = regexreplace(line)
            # CBGM produces a label (token) sequence for the line.
            predict_token = model.segmentation(line)
            # Convert the label sequence into a list of segmented words.
            seg = token2seg(predict_token, line)
            # Split words of the form [regex marker + hanzi] (leading '§').
            seg = timestamp(seg)
            # Word-count / character-count ratio too high means the CBGM
            # over-fragmented the line; fall back to BMM segmentation.
            if len(seg) / len(line) > 0.75:
                seg = pb.seg(line)
            # Restore the regex placeholders to their original text.
            post_regex_seg = []
            for word in seg:
                word, iter_list = regex_recover(word, iter_list)
                post_regex_seg.append(word)
            # Merge a Chinese-numeral year with the following '年' word.
            # NOTE(review): only the FIRST match per line is merged (break
            # after pop) — confirm that is intended.
            for i in range(len(post_regex_seg) - 1):
                if re.match(purehannum, post_regex_seg[i]) and post_regex_seg[i + 1] == '年':
                    post_regex_seg[i + 1] = post_regex_seg[i] + '年'
                    post_regex_seg.pop(i)
                    break
            # TODO dual version
            # Split an over-long first token at 19 characters; ad-hoc logic,
            # probably only effective on the training corpus — this step is
            # absent from the second version (CBGM_poct2).
            if len(post_regex_seg[0]) > 19:
                post_regex_seg.insert(1, post_regex_seg[0][19:])
                post_regex_seg.insert(1, post_regex_seg[0][:19])
                post_regex_seg.pop(0)
            # Emit the line: each word followed by the '/ ' separator.
            for word in post_regex_seg:
                result.write(str(word) + '/ ')
            result.write('\n')
# 跟第一版几乎一样 删去了长序列拆分这一步
def CBGM_poct2(trainfile: str, testfile: str, outresultfile: str) -> None:
    """Second version of CBGM_poct: identical pipeline, minus one step.

    Same as CBGM_poct (train CBGM + BMM on *trainfile*, segment *testfile*,
    write '/ '-separated words to *outresultfile*) except that the ad-hoc
    long-first-token split is omitted.

    Args:
        trainfile: path to the training corpus for both models.
        testfile: path to the text to segment.
        outresultfile: path of the segmented output file (overwritten).
    """
    # Initialise the two segmenters.
    model = CharacterBasedGenerativeModel()
    pb = PortableBMM()
    # Train both models on the same corpus.
    model.train(trainfile)
    pb.train(trainfile)
    with open(testfile, 'r', encoding=ENCODING) as file, \
        open(outresultfile, 'w', encoding=ENCODING) as result:
        # Per-line post-processing of the test file.
        for line in file:
            line = line.strip('\n')
            if line == '':
                # Preserve blank lines in the output.
                result.write('\n')
                continue
            # Regex preprocessing: presumably replaces regex-matched spans
            # with placeholders recorded in iter_list — TODO confirm.
            line, iter_list = regexreplace(line)
            # CBGM produces a label (token) sequence for the line.
            predict_token = model.segmentation(line)
            # Convert the label sequence into a list of segmented words.
            seg = token2seg(predict_token, line)
            # Split words of the form [regex marker + hanzi] (leading '§').
            seg = timestamp(seg)
            # Over-fragmented CBGM output: fall back to BMM segmentation.
            if len(seg) / len(line) > 0.75:
                seg = pb.seg(line)
            # Restore the regex placeholders to their original text.
            post_regex_seg = []
            for word in seg:
                word, iter_list = regex_recover(word, iter_list)
                post_regex_seg.append(word)
            # Merge a Chinese-numeral year with the following '年' word.
            # NOTE(review): only the first match per line is merged (break).
            for i in range(len(post_regex_seg) - 1):
                if re.match(purehannum, post_regex_seg[i]) and post_regex_seg[i + 1] == '年':
                    post_regex_seg[i + 1] = post_regex_seg[i] + '年'
                    post_regex_seg.pop(i)
                    break
            # Emit the line: each word followed by the '/ ' separator.
            for word in post_regex_seg:
                result.write(str(word) + '/ ')
            result.write('\n')
if __name__ == '__main__':
    # print(timestamp(['§胜利', 'hao', 'd']))
    # Train on 0203.txt, segment 199801_sent.txt, write to posttest.txt.
    CBGM_poct('0203.txt', '199801_sent.txt', 'posttest.txt')