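"""
info.py

Naive Bayes sentiment classification over negation-aware n-gram features,
plus a three-level language-detection cascade (Yandex Translate,
detectlanguage.com, offline langid).
"""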
from __future__ import division
from math import log, exp
import os            # kept for the (currently disabled) pickle-loading path below
import pickle        # kept for the (currently disabled) pickle-loading path below
import json
import logging

import requests
import langid
import detectlanguage
from textblob import TextBlob  # used only by the commented-out level-1 detector below

from config import API_KEY, TRANS_KEY, TRANS_KEY2
# Pretrained word -> count mappings; missing keys are expected to read as 0
# (see MyDict below).
from pos import pos
from neg import neg
from pos2 import pos2
from neg2 import neg2

detectlanguage.configuration.api_key = API_KEY

class MyDict(dict):
    """Dictionary that returns 0 instead of raising KeyError for missing keys."""
    def __getitem__(self, key):
        return self.get(key, 0)

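# Example: missing keys read as zero, which suits sparse word-count lookups.
#   d = MyDict({'good': 3})
#   d['good']    # -> 3
#   d['absent']  # -> 0
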
features = set()
# Total feature counts for the training corpora (index 0 = positive,
# index 1 = negative); totals2 holds the browser-data counterparts.
totals = [3321176, 3320100]
totals2 = [3321176, 3320100]
# All non-alphanumeric ASCII characters (only used by the commented-out
# stripping variant inside negate_sequence).
delchars = ''.join(c for c in map(chr, range(128)) if not c.isalnum())

# CDATA_FILE = "countdata.pickle"
FDATA_FILE = "reduceddata.pickle"
FDATA_FILE2 = "reduceddata2.pickle"  # for browser data analysis

def negate_sequence(text):
    """
    Detects negations and transforms negated words into "not_" form.
    Returns unigrams, bigrams and trigrams built from the transformed words.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    words = text.split()
    prev = None
    pprev = None
    for word in words:
        # stripped = word.strip(delchars)
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)
        if prev:
            bigram = prev + " " + negated
            result.append(bigram)
            if pprev:
                trigram = pprev + " " + bigram
                result.append(trigram)
            pprev = prev
        prev = negated

        # Flip the negation state on a negation cue; reset it at clause
        # delimiters. ("cue" avoids shadowing the imported neg dictionary.)
        if any(cue in word for cue in ["not", "n't", "no"]):
            negation = not negation
        if any(c in word for c in delims):
            negation = False
    return result

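# Illustrative example, traced by hand from the rules above:
#   negate_sequence("this is not good.")
#   -> ['this', 'is', 'this is', 'not', 'is not', 'this is not',
#       'not_good', 'not not_good', 'is not not_good']
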
def classify2(text):
    """
    Naive Bayes classification from pretrained data.
    Returns (is_positive, confidence), where confidence is the absolute
    difference of the two class log-likelihoods.
    """
    words = set(word for word in negate_sequence(text) if word in pos or word in neg)
    if not words:
        return True, 0
    # (debug) uncomment to inspect per-feature counts:
    # for word in words:
    #     print "p:", word, ":", pos[word]
    #     print "n:", word, ":", neg[word]

    # Log-likelihood of each class with add-one (Laplace) smoothing:
    # P(word | class) ~ (count(word, class) + 1) / (2 * totals[class])
    pos_prob = sum(log((pos[word] + 1) / (2 * totals[0])) for word in words)
    neg_prob = sum(log((neg[word] + 1) / (2 * totals[1])) for word in words)
    return (pos_prob > neg_prob, abs(pos_prob - neg_prob))

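# Usage sketch (illustrative; the scores are made up and actual predictions
# depend entirely on the trained pos/neg counts):
#   classify2("An awesome, funny movie")    # -> (True,  4.21)
#   classify2("This was not good at all")   # -> (False, 2.87)
# Each feature contributes log((count + 1) / (2 * total)) to its class score.
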
def classify3(text):
    """
    Same Naive Bayes classifier as classify2, but over the browser-data
    counts (pos2/neg2 with totals2).
    """
    words = set(word for word in negate_sequence(text) if word in pos2 or word in neg2)
    logging.debug(' words len = ' + str(len(words)))
    if not words:
        return True, 0
    # Smoothed log-likelihoods, as in classify2.
    pos_prob = sum(log((pos2[word] + 1) / (2 * totals2[0])) for word in words)
    neg_prob = sum(log((neg2[word] + 1) / (2 * totals2[1])) for word in words)
    return (pos_prob > neg_prob, abs(pos_prob - neg_prob))

def classify_demo(text):
    """Prints each feature's smoothed class probabilities, then the verdict."""
    words = set(word for word in negate_sequence(text) if word in pos or word in neg)
    if not words:
        print "No features to compare on"
        return True
    pprob, nprob = 0, 0
    for word in words:
        pp = log((pos[word] + 1) / (2 * totals[0]))
        np = log((neg[word] + 1) / (2 * totals[1]))
        print "%15s %.9f %.9f" % (word, exp(pp), exp(np))
        pprob += pp
        nprob += np
    print ("Positive" if pprob > nprob else "Negative"), "log-diff = %.9f" % abs(pprob - nprob)

def feature_selection_trials():
    """
    Select top k features. Vary k and plot data.
    Currently a stub: the pickle-loading body below is disabled, so the
    module relies on the pos/neg modules imported at the top instead.
    """
    # global pos, neg, totals, features
    # global pos2, neg2, totals2
    # retrain = False
    #
    # if not retrain and os.path.isfile(FDATA_FILE):
    #     pos, neg, totals = pickle.load(open(FDATA_FILE))
    #     pos2, neg2, totals2 = pickle.load(open(FDATA_FILE2))
    return

class LangDetect(object):
    """Thin client for the Yandex Translate detect/translate endpoints."""

    def __init__(self, text):
        self.host = 'https://translate.yandex.net'
        self.api_key = TRANS_KEY
        self.trans_to = 'en'
        self.trans_from = ''
        self.headers = {'content-type': 'application/x-www-form-urlencoded'}
        self.text = text
        self._build_urls()

    def _build_urls(self):
        self.url_detect = '/api/v1.5/tr.json/detect?hint=en,de,ur&key=%s' % self.api_key
        self.url_trans = '/api/v1.5/tr.json/translate?lang=%s%s&key=%s' % (
            self.trans_from, self.trans_to, self.api_key)

    def change_api_key(self):
        # Fall back to the secondary key and rebuild the request URLs.
        self.api_key = TRANS_KEY2
        self._build_urls()

    def connect(self):
        response = requests.post(
            self.host + self.url_detect, data={'text': self.text}, headers=self.headers)
        if response.status_code != 200:
            raise Exception('Error! Yandex')
        return response

    def detect(self):
        try:
            response = self.connect()
        except Exception:
            # First key failed (quota, suspension, ...): retry with the backup key.
            self.change_api_key()
            response = self.connect()
        if response.status_code == 200:
            # Yandex may return region variants such as "en-US"; keep the base code.
            lang_id = json.loads(response.text)['lang'].split('-')[0]
        else:
            raise Exception("Yandex did not return status code 200")
        return lang_id

    def translate(self, t_from):
        self.trans_from = t_from + '-'
        # Rebuild url_trans so it includes the requested source language.
        self._build_urls()
        response = requests.post(
            self.host + self.url_trans, data={'text': self.text}, headers=self.headers
        )
        if response.status_code == 200:
            trans_text = json.loads(response.text)['text'][0]
        else:
            raise Exception("Yandex did not return status code 200")
        return trans_text

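# Usage sketch (requires valid TRANS_KEY/TRANS_KEY2 Yandex keys in config.py;
# the sample text and language code are made up for illustration):
#   ld = LangDetect(u"hallo welt")
#   code = ld.detect()            # e.g. 'de'
#   english = ld.translate(code)  # POSTs with lang='de-en', returns the translation
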
# def lang_detect_level1(lang, gs):
# lang_id = TextBlob(lang).detect_language() # lang_id = en
# if lang_id in ['en', 'ar', 'bn', 'hi', 'ur']:
# if lang_id == 'hi':
# l_id = 'rd'
# else:
# l_id = lang_id
# else:
# l_id = 'na'
# return {'language_id': l_id, 'language': gs.get_languages()[lang_id]}
def lang_detect_level1(lang, gs):
    """Level 1: detect via the Yandex API (LangDetect above)."""
    lang_id = LangDetect(lang).detect()
    # ASCII heuristic: pure-ASCII text outside the supported codes is tagged
    # 'rd' (presumably romanized input); everything else is 'na'.
    is_ascii = True
    try:
        lang.decode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        is_ascii = False
    if lang_id in ['en', 'ar', 'bn', 'ur', 'nl']:
        l_id = lang_id
    elif is_ascii:
        l_id = 'rd'
    else:
        l_id = 'na'
    return {'language_id': l_id, 'language': gs.get_languages()[lang_id]}

def lang_detect_level2(lang, gs):
    """Level 2: detect via the detectlanguage.com API."""
    status = detectlanguage.user_status()
    if status['status'] != 'ACTIVE':
        raise Exception('Account Suspended')
    is_ascii = True
    try:
        lang.decode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        is_ascii = False
    # e.g. [{'isReliable': True, 'confidence': 12.04, 'language': 'es'}]
    detections = detectlanguage.detect(lang)
    detected = detections[0]['language']
    if detected in ['en', 'ar', 'bn', 'ur', 'nl']:
        l_id = detected
    elif is_ascii:
        l_id = 'rd'
    else:
        l_id = 'na'
    return {'language_id': l_id, 'language': gs.get_languages()[detected]}

def lang_detect_level3(lang, gs):
    """Level 3: offline fallback via langid (https://github.com/saffsd/langid.py)."""
    res = langid.classify(lang)  # returns (language_code, score)
    is_ascii = True
    try:
        lang.decode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        is_ascii = False
    if res[0] in ['en', 'ar', 'bn', 'ur', 'nl', 'es']:
        l_id = res[0]
    elif is_ascii:
        l_id = 'rd'
    else:
        l_id = 'na'
    return {'language_id': l_id, 'language': gs.get_languages()[res[0]]}

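# Hypothetical caller (not part of this module): chain the three levels so a
# paid-API failure degrades gracefully to the offline langid fallback.
#   def detect_language(text, gs):
#       try:
#           return lang_detect_level1(text, gs)
#       except Exception:
#           try:
#               return lang_detect_level2(text, gs)
#           except Exception:
#               return lang_detect_level3(text, gs)
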
def setup():
    feature_selection_trials()

if __name__ == '__main__':
    feature_selection_trials()