-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathintelliSeg.py
194 lines (158 loc) · 4.64 KB
/
intelliSeg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: LiSnB
# @Date: 2014-05-29 12:46:15
# @Last Modified by: LiSnB
# @Last Modified time: 2014-06-07 17:25:46
# @Email: [email protected]
"""
# @comment here:
"""
from __future__ import division
from hmm import viterbi
from mm import fmm,maxprob,maxprob_viterbi
from configs import config
if config.USE_CHARDETECT:
import chardet_
# One-letter method codes (as accepted by seg() and the -s option) mapped to
# the segmentation callable implementing each method.
HANDLERS = dict(
    f=fmm.seg,              # Forward Maximum Matching
    m=maxprob.seg,          # Maximum Probability Path
    v=viterbi.seg,          # HMM & Viterbi
    x=maxprob_viterbi.seg,  # Hybrid method
)
class EncodingError(Exception):
    """Error describing an input encoding that cannot be handled.

    :param value: the name of the offending encoding; kept on ``self.value``
        and also forwarded to ``Exception`` so that ``args``, ``repr()`` and
        pickling carry it.
    """

    def __init__(self, value):
        super(EncodingError, self).__init__(value)
        self.value = value

    def __str__(self):
        # The trailing newline is part of the established message format.
        return 'Unhandlable Encoding: %s\n' % self.value
# def handle_encoding(sentence):
def segit(sentence, handler):
    """Cut *sentence* into segments using *handler*.

    The text is split with ``config.re_chinese``; every piece that matches
    ``config.re_chinese`` is handed to *handler*, every other piece is split
    with ``config.re_punc`` and its non-blank parts are kept unchanged.

    :param sentence: the text to cut.  A byte string is decoded first, as
        UTF-8 unless ``config.USE_CHARDETECT`` is set and detection reports
        another encoding; text that is already decoded is used as-is.
    :param handler: callable taking one piece of text and returning an
        iterable of segments, or a false value for an unknown method.
    :returns: the list of segments, or a one-element list holding a hint
        when *handler* is false.
    """
    if not handler:
        return ['[not a valid segment handler.]']

    is_bytes = isinstance(sentence, bytes)

    encoding = 'utf-8'
    if config.USE_CHARDETECT and is_bytes:
        detected = chardet_.detect(sentence)['encoding']
        # Detection may yield no encoding at all (presumably None, as chardet
        # does for undecidable input); fall back to UTF-8 instead of crashing.
        if detected and detected.lower() != 'utf-8':
            encoding = detected

    # Only byte strings need decoding; pure-ASCII input is passed through
    # untouched, as before.
    if is_bytes and encoding != 'ascii':
        text = sentence.decode(encoding)
    else:
        text = sentence

    segments = []
    for chunk in config.re_chinese.split(text):
        if config.re_chinese.match(chunk):
            segments.extend(handler(chunk))
        else:
            # Drop pieces that are empty or whitespace only.
            segments.extend(
                piece for piece in config.re_punc.split(chunk) if piece.strip()
            )
    return segments
def seg(inputcontent, handlers='xfvm', is_file=False):
    """Segment a sentence, or the content of a file, with several methods.

    :param inputcontent: the sentence itself, or the path of the file that
        holds it when *is_file* is true.
    :param handlers: string of one-letter method codes (keys of
        ``HANDLERS``); each letter is applied independently.  Unknown
        letters produce a hint instead of a result.
    :param is_file: when true, read the input from *inputcontent* and write
        one UTF-8 result file per method next to it, named
        ``<inputcontent>.<code>.seg`` (plus ``.txt`` when
        ``config.ISWINDOWS`` is set); otherwise print the results.
    :returns: a list with one list of segments per method, in the order of
        *handlers*.
    """
    handlers = list(handlers)

    if is_file:
        with open(inputcontent) as source:
            sentence = source.read()
    else:
        sentence = inputcontent

    segments = [segit(sentence, HANDLERS.get(code)) for code in handlers]

    if is_file:
        # The suffix carries its own dot so that names on non-Windows
        # systems do not end in a stray '.'.
        suffix = '.txt' if config.ISWINDOWS else ''
        for code, segment in zip(handlers, segments):
            filename = '%s.%s.seg%s' % (inputcontent, code, suffix)
            with open(filename, 'wb') as target:
                target.write('/'.join(segment).encode('utf-8'))
    else:
        # Single-argument print() calls behave the same as the print
        # statement, so the output is unchanged.
        print('')
        for code, segment in zip(handlers, segments):
            print('[%s]: %s\n' % (code, '/'.join(segment)))
    return segments
# Help text printed for -h and when the script is started without options.
usage = """
usage: intelliSeg.py -i <inputcontent> [-s <segmentmethod>] [-f] [-d <userdict>]

detail:
    -h  help information
    -s  choose the method(s) to cut:
            f: Forward Maximum Matching
            m: Maximum Probability Path
            v: HMM & Viterbi ('B','M','E','S')
            x: Hybrid method via v and f (default)
        you can combine them when you want to compare the results:
            eg: -s fm
            >> return the results of both methods
        if you don't specify, default:
            x
        if you input something else, it will return a hint
            eg: -s p -i "hello world"
            >> [p]: [not a valid segment handler.]
    -i  the content you want to cut.
        it can be either a sentence or a path to the file containing
        the sentence
        you are supposed to surround the content with quotes when it
        contains spaces.
            eg: valid: "hello world"
                invalid: hello world
        if a sentence is provided, the result will be output directly
            eg: -i "hello world" -s fm
            >> [f]: hello/world
            >> [m]: hello/world
        if a file path is provided, there will be some other files
        added to the directory
            eg: -i "/foo/biz.txt" -f -s fv
            >> the files "/foo/biz.txt.f.seg"
               and "/foo/biz.txt.v.seg"
               will be created, the content
               is the same as the output
        if it is run on a Windows OS, there will be a '.txt' too
        in case you want to open it with notepad.
    -f  specify -f means the input is a file. if it is just a single
        sentence, ignore it.
    -d  use a user-defined dict
            eg: -d '../dicts/user-defined.dict'
        your dict should be in this format
            <word> <freq>
"""
if __name__ == '__main__':
import sys,getopt
opts,args = getopt.getopt(sys.argv[1:], 'hi:s:d:f')
handlers='x'
isfile=False
inputcontent=''
userdict='^'
if len(opts) is 0:
print usage
for op,v in opts:
if op == '-h':
print usage
elif op == '-s':
handlers=v
elif op == '-f':
isfile = True
elif op == '-i':
inputcontent=v
elif op == '-d':
userdict=v
if not inputcontent:
print 'You should provide a content at least. with -h to check the usage.'
exit(1)
if userdict != '' and userdict != '^':
config.initialize(userdict)
elif userdict == '':
print 'User Dict Invalid, Use Default'
seg(inputcontent, handlers,isfile)
# print handlers,isfile,inputcontent