train_postag.py
import re
import nltk
import pickle
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tree import Tree
from probCalc import probability as PB
from tagger import Tagger
UNIQ = '_UNIQUE_STRING_'
"""
"""
def save_obj(obj, name ):
with open('obj/'+ name + '.pkl', 'wb+') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
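
# Not in the original script: a minimal loader counterpart to save_obj,
# assuming the same obj/<name>.pkl layout. Sketch only.
def load_obj(name):
    """Load a pickled object previously written by save_obj."""
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)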
"""
"""
def train_postag(corpus="dataset/postagged.txt"):
global TAGGER
data = []
r = re.compile(r"""\\(.)""")
f = open(corpus)
lines = f.read().split('\n')
f.close()
for line in lines:
fx_line = r.sub(r'\1', line)
sen_tag = []
for word_tag in fx_line.split(" "):
splitter_index = word_tag.rfind("/")
word = word_tag[:splitter_index]
tag = word_tag[splitter_index + 1:]
sen_tag.append((word, tag))
data.append(sen_tag)
save_obj(PB(data), "tagger")
"""
"""
def train_postag2(corpus="dataset/postag_ui.txt"):
global TAGGER
data = []
print "read file..."
f = open(corpus)
lines = f.read().split('\n\n')
f.close()
for line in lines:
fx_line = line.split('\n')
sen_tag = []
for _word in fx_line:
word_tag = _word.split("\t")
word = word_tag[0]
tag = word_tag[1]
sen_tag.append((word, tag))
data.append(sen_tag)
print "end read file"
print "training pos tagger..."
save_obj(PB(data), "tagger2")
print "end training pos tagger"

if __name__ == "__main__":
    train_postag2()
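
# Usage sketch (not part of the original script): after training, the pickled
# model written to obj/tagger2.pkl can be reloaded with the hypothetical
# load_obj helper defined above, e.g.:
#
#     model = load_obj("tagger2")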