logisticRegression.py
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
#This is the parser for XML and HTML documents
from bs4 import BeautifulSoup
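#NOTE: word_tokenize and WordNetLemmatizer need the 'punkt' and
#'wordnet' NLTK data packages; if they are missing, run
#nltk.download('punkt') and nltk.download('wordnet') once beforehand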
wordNetLemmatizer = WordNetLemmatizer()
stopWords = set(w.rstrip() for w in open('stopwords.txt'))
#In this case, we are going to parse an XML file
#review_text is the only field we require
positiveReviews = BeautifulSoup(open('Electronics/positive.xml'), "lxml")
positiveReviews = positiveReviews.findAll('review_text')
negativeReviews = BeautifulSoup(open('Electronics/negative.xml'), "lxml")
negativeReviews = negativeReviews.findAll('review_text')
#Now balance the classes: keep only as many positive reviews as negative ones
np.random.shuffle(positiveReviews)
positiveReviews = positiveReviews[:len(negativeReviews)]
#####Data preparation to remove stop words#######
#Defining a function to tokenize text and remove stop words
def tokenizer(s):
    #All words to lowercase
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    #Keep only tokens longer than two characters
    tokens = [t for t in tokens if len(t) > 2]
    #Reduce words to their base form, e.g. "cameras" -> "camera"
    tokens = [wordNetLemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopWords]
    return tokens
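#For example (assuming "the" and "are" appear in stopwords.txt):
#tokenizer("The cameras are great!") -> ['camera', 'great']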
wordIndexMap = {}
currentIndex = 0
#Tokenize every review and build wordIndexMap: word -> feature column index
positiveTokenized = []
negativeTokenized = []
for review in positiveReviews:
    tokens = tokenizer(review.text)
    positiveTokenized.append(tokens)
    for token in tokens:
        if token not in wordIndexMap:
            wordIndexMap[token] = currentIndex
            currentIndex += 1
for review in negativeReviews:
    tokens = tokenizer(review.text)
    negativeTokenized.append(tokens)
    for token in tokens:
        if token not in wordIndexMap:
            wordIndexMap[token] = currentIndex
            currentIndex += 1
def tokensToVector(tokens, label):
    #One slot per word in the vocabulary, plus one for the label
    x = np.zeros(len(wordIndexMap) + 1)
    for t in tokens:
        i = wordIndexMap[t]
        x[i] += 1
    #Normalize the raw counts to proportions
    if x.sum() != 0:
        x = x/x.sum()
    x[-1] = label
    return x
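#For example, assuming both words are already in wordIndexMap,
#tokensToVector(['camera', 'camera', 'great'], 1) puts 2/3 in the
#'camera' slot, 1/3 in the 'great' slot, and 1 (the label) at the end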
tokenizedNumber = len(positiveTokenized) + len(negativeTokenized)
data = np.zeros((tokenizedNumber, len(wordIndexMap) + 1))
i = 0
for tokens in positiveTokenized:
    xy = tokensToVector(tokens, 1)
    data[i,:] = xy
    i += 1
for tokens in negativeTokenized:
    xy = tokensToVector(tokens, 0)
    data[i,:] = xy
    i += 1
########Building the model#########
np.random.shuffle(data)
x = data[:, :-1]
y = data[:, -1]
#Hold out the last 100 shuffled rows as a test set
xTrain = x[:-100,]
yTrain = y[:-100,]
xTest = x[-100:,]
yTest = y[-100:,]
model = LogisticRegression()
model.fit(xTrain, yTrain)
print("Classification rate:", model.score(xTest, yTest))
#Show the most discriminative words (weights far from zero)
threshold = 0.6
for word, index in wordIndexMap.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)
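#A minimal sketch of scoring a brand-new review with the trained model.
#The review text here is a hypothetical example; tokens that never
#appeared in training are skipped, since tokensToVector would raise a
#KeyError on words missing from wordIndexMap
newTokens = [t for t in tokenizer("This charger works great") if t in wordIndexMap]
newVector = tokensToVector(newTokens, 0)[:-1] #drop the unused label slot
print("Prediction for new review:", model.predict([newVector]))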