-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregex_preprocess.py
84 lines (73 loc) · 3.5 KB
/
regex_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
# Module-level token patterns.
#
# The symbol classes below used to be written as alternations containing an
# unescaped '.', which matches ANY character, so `num` and `seq` matched
# arbitrary text. They are now explicit character classes. The duplicated
# alternatives in the earlier patterns suggest that full-width forms were
# intended next to the ASCII ones; they are spelled as \uXXXX escapes here so
# they survive text normalisation. Capture-group numbering is unchanged.
_NUM_SYMBOL_CLASS = (
    r'[0-9\uFF10-\uFF19'                      # ASCII and full-width digits
    r'\-\uFF0D—%\uFF05+.\uFF0E·:\uFF1A∶/]'    # sign, percent, dot, colon, slash
)
_SEQ_SYMBOL_CLASS = (
    r'[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A'      # ASCII and full-width letters
    r'0-9\uFF10-\uFF19'                       # ASCII and full-width digits
    r'\-\uFF0D—%\uFF05+.\uFF0E·:\uFF1A∶/]'    # sign, percent, dot, colon, slash
)

# Chinese numeral expression with optional prefix (e.g. ordinal / fraction).
ch = re.compile(r'(百分之|千分之|万分之|第)*((一|二|三|四|五|六|七|八|九|十|零|两|○)+(千|万|亿|百|点|分之|比|几)*)+')
# Run of digits and numeric punctuation, optionally preceded by the ordinal prefix.
num = re.compile(r'(第)*(' + _NUM_SYMBOL_CLASS + r')+')
# Run of Latin letters, digits and numeric punctuation.
seq = re.compile(r'((' + _SEQ_SYMBOL_CLASS + r')+)')
# Digits followed by a time unit. NOTE: this name shadows the stdlib `time`
# module inside this file; it is kept because importers may rely on it.
time = re.compile(r'([0-9\uFF10-\uFF19])+(年|月|日|小时|点钟|时|分|秒)')
# A whole string made of two or more Chinese digit characters (no unit words).
purehannum = re.compile(r'^(一|二|三|四|五|六|七|八|九|零|○)(一|二|三|四|五|六|七|八|九|零|○)+$')
# Patterns used by regexreplace, compiled once at import time.
# Chinese numeral expressions (optionally prefixed by an ordinal/fraction word).
_REPLACE_CH = re.compile(
    r'(百分之|千分之|万分之|第)*((一|二|三|四|五|六|七|八|九|十|零|两|○)+(千|万|亿|百|点|分之|比|几)*)+'
)
# Runs of Latin letters, digits and numeric punctuation. This used to be an
# alternation containing an unescaped '.', which matched every character and
# collapsed the whole line into one placeholder. Full-width variants are
# written as \uXXXX escapes; the group layout matches the previous pattern.
_REPLACE_SEQ = re.compile(
    r'(('
    r'[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A'
    r'0-9\uFF10-\uFF19'
    r'\-\uFF0D—%\uFF05+.\uFF0E·:\uFF1A∶/]'
    r')+)'
)
# (pattern, placeholder) pairs, applied in this order.
# TODO: let callers supply this list instead of hard-coding it here.
_REPLACEMENTS = ((_REPLACE_CH, 'è'), (_REPLACE_SEQ, '§'))


def regexreplace(line):
    """Replace every pattern match in *line* with a one-character placeholder.

    The patterns are applied in sequence: Chinese numeral expressions become
    'è', then runs of Latin letters/digits/numeric punctuation become '§'.

    Args:
        line: The text to preprocess.

    Returns:
        A ``(new_line, iter_list)`` tuple. ``iter_list`` holds one lazy match
        iterator per pattern, in the same order as the placeholders above.
        Each iterator scans the text as it was just before that pattern's
        substitution, so the second one sees the first placeholder already in
        place.

    Note:
        Placeholder characters that already occur in *line* are not escaped.
    """
    iter_list = []
    for pattern, token in _REPLACEMENTS:
        # Capture the matches before substituting so they can be restored later.
        iter_list.append(pattern.finditer(line))
        line = pattern.sub(token, line)
    return line, iter_list
# Placeholder character -> index of its match iterator inside iter_list.
_TOKEN_SLOTS = {'è': 0, '§': 1}


def regex_recover(word, iter_list):
    """Restore placeholder characters in *word* to the text they replaced.

    Each 'è' consumes the next match from ``iter_list[0]`` and each '§' the
    next match from ``iter_list[1]``. The word is rebuilt in a single pass
    over its original characters, so several placeholders in one word are all
    restored; the previous in-place version stopped at the original length
    and missed placeholders that had been shifted right by earlier
    replacements.

    Args:
        word: Text that may contain placeholder characters.
        iter_list: Sequence of match iterators, indexed as described above.
            An entry is only accessed when its placeholder occurs in *word*.

    Returns:
        A ``(restored_word, iter_list)`` tuple; ``iter_list`` is the same
        object that was passed in, with the consumed matches advanced.
    """
    pieces = []
    for char in word:
        slot = _TOKEN_SLOTS.get(char)
        if slot is None:
            pieces.append(char)
            continue
        match = next(iter_list[slot], None)
        if match is None:
            # Best effort: no match left for this placeholder, keep it as is.
            print('stopit')
            pieces.append(char)
        else:
            pieces.append(str(match.group()))
    return ''.join(pieces), iter_list
# Whole string made of Latin letters, digits and numeric punctuation. This
# used to be an alternation containing an unescaped '.', which matched any
# character, so every non-empty line passed the check. Full-width variants
# are written as \uXXXX escapes.
_CHECK_SYMBOL_RUN = re.compile(
    r'^('
    r'[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A'
    r'0-9\uFF10-\uFF19'
    r'\-\uFF0D—%\uFF05+.\uFF0E·:\uFF1A∶/]'
    r')+$'
)
# Hyphen-separated digit groups such as 2020-01-02. Mostly covered by the
# pattern above; kept because \d also accepts non-ASCII decimal digits.
_CHECK_DATE = re.compile(r'^(\d+-)+\d+$')


def regexcheck(line):
    """Return True if *line* is a non-Chinese symbol string.

    A string qualifies when it consists only of Latin letters, digits and
    numeric punctuation, or when it is a hyphen-separated group of digits.

    Args:
        line: The string to test.

    Returns:
        True if the whole string qualifies, False otherwise (including for
        the empty string).
    """
    if _CHECK_SYMBOL_RUN.match(line):
        return True
    return bool(_CHECK_DATE.match(line))
def run_regex_demo():
    """Print a small manual demonstration of the sequence and numeral patterns.

    Output, in order: the ``findall`` result of the symbol-run pattern on a
    mixed sample, the numeral sample with matches replaced by a marker, and
    then each numeral match on its own line.
    """
    # Runs of Latin letters, digits and numeric punctuation. The character
    # class replaces an alternation whose unescaped '.' matched everything.
    seq = re.compile(
        r'(('
        r'[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A'
        r'0-9\uFF10-\uFF19'
        r'\-\uFF0D—%\uFF05+.\uFF0E·:\uFF1A∶/]'
        r')+)'
    )
    # Chinese numeral expressions. A stray empty alternative in the digit
    # group used to let this pattern match the empty string, so the marker
    # was inserted at every position.
    ch = re.compile(
        r'(百分之|千分之|万分之|第)*((一|二|三|四|五|六|七|八|九|十|零|两|○)+(千|万|亿|百|点|分之|比|几)*)+'
    )
    sample = '去北京大学第一玩'

    m = seq.findall('18-94年pc2我12%')
    pline = ch.sub('♟', sample)
    print(m)
    print(pline)
    for match in ch.finditer(sample):
        print(match.group())


# Manual smoke test only; not part of the preprocessing API.
if __name__ == '__main__':
    run_regex_demo()